Update CPU kernel implementations and guard directives

- Add fixed-format (ffhybrid/ffinterleaved) bf16 and fp32 kernel
  variants to the GEMM implementation lists.
- Explicitly instantiate get_gemm_method() for the integer and
  quantized GEMM types.
- Restrict the SME2 no-merge quantized interleaved kernels to
  single-threaded execution.
- Remove the unused GemmInterleavedPretransposed2d implementation.
- Update register usage in several a64 interleave kernels.

Resolves COMPMID-6023

Change-Id: I868975d14c4f98af6716726feda22405a6a4c891
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
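
Illustrative sketch of the selection pattern the touched lists rely on:
each implementation list is a sentinel-terminated array of descriptors,
and helpers such as get_gemm_method() walk it until a descriptor with
method==GemmMethod::DEFAULT is reached, taking the first supported entry.
The types and names below are simplified stand-ins for illustration only,
not the actual arm_gemm classes or signatures.

    // Standalone sketch of sentinel-terminated implementation selection.
    // Build: g++ -std=c++17 selection_sketch.cpp
    #include <cstdio>
    #include <functional>

    namespace sketch {

    enum class Method { DEFAULT, GEMM_HYBRID, GEMM_INTERLEAVED };

    struct Args {
        bool has_bf16;   // stand-in for CPU feature queries such as has_bf16()
    };

    struct Implementation {
        Method method;
        const char *name;
        std::function<bool(const Args &)> is_supported;  // empty => always supported
    };

    // The list ends with a Method::DEFAULT sentinel, mirroring the real lists.
    static const Implementation impl_list[] = {
        { Method::GEMM_INTERLEAVED, "ffinterleaved_bf16_example",
          [](const Args &a) { return a.has_bf16; } },
        { Method::GEMM_HYBRID, "generic_fallback_example", nullptr },
        { Method::DEFAULT, nullptr, nullptr },  // sentinel terminates the walk
    };

    // Return the first supported descriptor, or nullptr if none matches.
    const Implementation *select(const Args &args) {
        for (const Implementation *i = impl_list; i->method != Method::DEFAULT; i++) {
            if (!i->is_supported || i->is_supported(args)) {
                return i;
            }
        }
        return nullptr;
    }

    } // namespace sketch

    int main() {
        sketch::Args args{/*has_bf16=*/false};
        if (const auto *impl = sketch::select(args)) {
            std::printf("selected: %s\n", impl->name);  // falls back to the generic entry
        }
        return 0;
    }

The new fixed-format entries added below slot into such lists ahead of the
generic fallbacks, guarded by their is_supported predicates.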
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 515d55c..2d743a4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020, 2022 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,8 @@
 #include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
 #endif // ARM_COMPUTE_ENABLE_SME2
 
+#include "kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
 #include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
 #include "kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp"
 #include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
@@ -204,6 +206,30 @@
     [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
 ),
+GemmImplementation<bfloat16, float>::with_estimate(
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_ffinterleaved_bf16fp32_mmla_8x12",
+    KernelWeightFormat::VL256_BL64,
+    [](const GemmArgs &args) { return args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+    [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_ffhybrid_bf16fp32_mmla_6x16",
+    KernelWeightFormat::VL256_BL64,
+    [](const GemmArgs &args) { return args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+    [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_ffinterleaved_bf16fp32_dot_8x12",
+    KernelWeightFormat::VL128_BL32,
+    [](const GemmArgs &args) { return args._ci->has_bf16(); },
+    [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+    [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
+),
 #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
 GemmImplementation<bfloat16, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index ee567a2..44a7bb8 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -66,6 +66,10 @@
 #include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
 #endif // ARM_COMPUTE_ENABLE_SME2
 
+#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
+#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp"
+#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp"
 #include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp"
 #include "kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp"
 #include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 19c8fca..5e77df7 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, 2022 Arm Limited.
+ * Copyright (c) 2018-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -212,9 +212,11 @@
                        instantiate(instantiate) {   }
 };
 
-/* "Main" function implemented for each valid combination of types.
- * Returns a list of GEMM implementation descriptors for processing by the
- * other functions, ended by an implementation with
+/* Provides the list of implementation descriptors which is processed by the
+ * other functions.
+ *
+ * A specialised version is provided for each supported combination of types.
+ * The end of the list is indicated by a sentinel descriptor with
  * method==GemmMethod::DEFAULT.  */
 template<typename Top, typename Tret, class OutputStage = Nothing>
 const GemmImplementation<Top, Tret, OutputStage> *gemm_implementation_list();
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index 18d8fc9..aa6ecc2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020, 2022 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,7 @@
 /* Explicitly instantiate the external functions for these types. */
 template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template bool has_opt_gemm<int16_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<int16_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index b0a0188..fd20e53 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -186,6 +186,7 @@
 /* Explicitly instantiate the external functions for these types. */
 template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template bool has_opt_gemm<int8_t, int32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<int8_t, int32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t, Nothing> (const GemmArgs &args, const Nothing &);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
deleted file mode 100644
index b71f390..0000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_gemm.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
-#include "transform.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND    64
-#define ROUND_UP(x)    ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-namespace arm_gemm {
-
-template<typename strategy, typename To, typename Tr>
-class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
-    typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type Tri;
-
-    /* const properties set by constructor */
-    const CPUInfo * const _ci;
-
-    const unsigned int _Msize;
-    const unsigned int _Nsize;
-    const unsigned int _Ksize;
-
-    const unsigned int _nbatches;
-    const unsigned int _nmulti;
-
-    const Activation _act;
-
-    const int _maxthreads;
-    int _nthreads;
-
-    /* Blocking info */
-    unsigned int _k_block=0;
-    unsigned int _x_block=0;
-
-    unsigned int _Mround_div=0;
-    unsigned int _Mround=0;
-    unsigned int _Nround_div=0;
-    unsigned int _Nround=0;
-
-    /* Working space, pretransposed buffer */
-    const Toi *_B_transposed=nullptr;
-    void *_working_space=nullptr;
-
-    /* We will need to walk through the blocks of B in a few contexts, so
-     * factor that out.  */
-    class blockwalker {
-    private:
-        /* Size loops, etc. based on our parent's configuration */
-        const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent;
-
-        /* K, X and multi parameters for current iteration. */
-        unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
-
-        unsigned int _index=0;
-        bool _done=false;
-        bool _newkblock=true;
-        bool _newmulti=true;
-
-    public:
-        blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent)
-        : _parent(parent)
-        , _xmax { parent._Nsize }
-        { }
-
-        blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
-        : _parent(parent)
-        , _x0   { x0   }
-        , _xmin { x0   }
-        , _xmax { xmax }
-        {
-            assert(_x0 <= _xmax);
-        }
-
-        unsigned int xmax() {
-            return std::min(_x0 + _parent._x_block, _xmax);
-        }
-
-        unsigned int kmax() {
-            return std::min(_k0 + _parent._k_block, _parent._Ksize);
-        }
-
-        /* Advance to the next block, return false at the end. */
-        bool advance(void) {
-            if (_done) {
-                return false;
-            }
-
-            _newkblock=false;
-            _x0 += _parent._x_block;
-            if (_x0 >= _xmax) {
-                _x0=_xmin;
-                _k0 += _parent._k_block;
-                if (_k0 >= _parent._Ksize) {
-                    _k0=0;
-                    _multi++;
-                    if (_multi >= _parent._nmulti) {
-                        _done=true;
-                        return false;
-                    }
-                    _newmulti=true;
-                }
-                _newkblock=true;
-            }
-            _index++;
-
-            return true;
-        }
-
-        unsigned int k0(void) { return _k0; }
-        unsigned int x0(void) { return _x0; }
-        unsigned int multi(void) { return _multi; }
-        unsigned int index(void) { return _index; }
-        bool done(void) { return _done; }
-        bool newkblock(void) { return _newkblock; }
-    };
-
-    // A working size: One of these needed, regardless of thread count.  Divided according to window.
-    size_t get_a_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
-    }
-
-    // As B will be pretranspose we do not need to alloc any space for it
-    size_t get_b_working_size() const {
-        return 0;
-    }
-
-    // C working size: One needed per thread.
-    size_t get_c_working_size() const {
-        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
-    }
-
-    // Internal execute function.
-    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
-    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
-        /* Make sure we've been set up correctly. */
-        assert(_B_transposed);
-        assert(_working_space);
-        assert(this->_Aptr);
-        assert(this->_Cptr);
-
-#ifdef CYCLE_PROFILING
-        profiler prof;
-#endif
-        strategy strat(_ci);
-
-        /* Translate 'start' and 'end' into a position within the batches and rows. */
-        const unsigned int window_per_batch = _Mround / strategy::out_height();
-        unsigned int batch_0   = m_start / window_per_batch;
-        unsigned int batch_end = m_end   / window_per_batch;
-
-        /* Compute the M values to operate on */
-        unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
-        unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
-
-        unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
-        unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
-
-        blockwalker current(*this, n_0, n_max);
-
-        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
-
-        auto c_panel_start = working_space_bytes;
-        auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
-
-        auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
-        auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);
-
-        /* B^t is stored in interleaved panels separated by their K-block component
-         * we want to store a pointer to the start of the current k-page
-         * then when we come to the next k-block we just add the size of the previous to
-         * this base pointer
-         */
-        const Toi *b_panel_start = _B_transposed;
-        // b_panels stores a pointer to the start of our current block inside of the k-block
-        const Toi *b_panel       = b_panel_start;
-
-        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
-        unsigned b_page_size = 0;
-        int kern_k = 0;
-        for (;!current.done();current.advance()) {
-            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
-
-            if (current.newkblock()) {
-                kern_k         = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
-                kern_k        *= strat.k_unroll();
-
-                unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());
-
-                b_panel_start += b_page_size;
-                b_panel        = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
-                b_page_size    = _Nround * kern_k;
-
-                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
-
-                    if (first_m >= last_m)
-                        continue;
-
-                    auto a_thread_panel_in  = this->_Aptr
-                                            + (batch * this->_A_batch_stride)
-                                            + (current.multi() * this->_A_multi_stride);
-
-                    auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
-
-                    strat.transforms.PrepareA(
-                        a_thread_panel_out,
-                        a_thread_panel_in,
-                        this->_lda,
-                        first_m,
-                        last_m,
-                        current.k0(),
-                        current.kmax(),
-                        0);
-                }
-            }
-
-            /* Do the actual work. */
-            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
-
-                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
-
-                if (first_m >= last_m)
-                    continue;
-
-                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
-                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());
-
-                    strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
-                    a_ptr += (strategy::out_height() * kern_k);
-
-                    /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
-                    const bool first_pass = current.k0()==0;
-                    const bool last_pass  = current.kmax()==_Ksize;
-
-                    auto c_panel_out = this->_Cptr
-                                     + this->_C_batch_stride * batch
-                                     + this->_C_multi_stride * current.multi();
-
-                    auto bias        = (first_pass && this->_bias)
-                                     ? this->_bias + (current.multi() * this->_bias_multi_stride)
-                                     : nullptr;
-
-                    auto act        = last_pass ? _act : Activation();
-
-                    strat.transforms.Merge(
-                        c_panel_out,
-                        c_panel,
-                        this->_ldc,
-                        y,
-                        ymax,
-                        current.x0(),
-                        current.xmax(),
-                        bias,
-                        act,
-                        !first_pass);  //Append
-                }
-            }
-
-            b_panel += (bblocks * strat.out_width() * kern_k);
-        }
-    }
-
-    static unsigned int get_k_block_size(const GemmArgs &args) {
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            return args._cfg->inner_block_size;
-        }
-
-        const unsigned int L1_size = args._ci->get_L1_cache_size();
-        unsigned int k_block;
-
-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-        // Needs to be (at least a single) multiple of the K unroll level.
-        k_block /= strategy::k_unroll();
-        k_block = std::max(k_block, 1U) * strategy::k_unroll();
-
-        // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
-
-        // So divide the space equally into that many blocks.
-        k_block = iceildiv(args._Ksize, numk_blocks);
-
-        // And round UP to the K unroll level required.
-        k_block = iceildiv(k_block, strategy::k_unroll());
-        k_block *= strategy::k_unroll();
-
-        return k_block;
-    }
-
-public:
-    GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
-    GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
-
-    /* Constructor */
-    GemmInterleavedPretransposed2d(const GemmArgs &args)
-    :    _ci(args._ci)
-    ,    _Msize(args._Msize)
-    ,    _Nsize(args._Nsize)
-    ,    _Ksize(args._Ksize)
-    ,    _nbatches(args._nbatches)
-    ,    _nmulti(args._nmulti)
-    ,    _act(args._act)
-    ,    _maxthreads(args._maxthreads)
-    ,    _nthreads(args._maxthreads)
-    ,    _k_block(get_k_block_size(args))
-    // Work out the rounded size of M - needed for some buffers.
-    ,    _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
-    ,    _Mround     ( _Mround_div * strategy::out_height()     )
-
-    ,    _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
-    ,    _Nround     ( _Nround_div * strategy::out_width()     )
-    {
-        assert(_maxthreads > 0);
-
-        const unsigned int L2_size = _ci->get_L2_cache_size();
-
-        if (args._cfg && args._cfg->outer_block_size) {
-            _x_block = args._cfg->outer_block_size;
-        } else {
-            // x_block: Work out how many rows (of length k_block) will fit in the L2
-            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                      (sizeof(Toi) * _k_block);
-
-            // Needs to be (at least a single) multiple of the kernel output width.
-            _x_block /= strategy::out_width();
-            _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
-            // And tune to the presented problem size.
-            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
-            _x_block = iceildiv(_Nsize, num_x_blocks);
-
-            _x_block = iceildiv(_x_block, strategy::out_width());
-            _x_block *= strategy::out_width();
-        }
-    }
-
-    // Interface implementation - Compulsory functions
-    ndrange_t get_window_size() const override {
-        unsigned m = (_Mround / strategy::out_height()) * _nbatches;
-        unsigned n = _Nround_div;
-
-        return { m, n };
-    }
-
-    bool supports_dynamic_scheduling() const override {
-        return true;
-    }
-
-    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
-    void set_nthreads(int nthreads) override {
-        _nthreads = std::min(nthreads, _maxthreads);
-    }
-
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        /* This particular GEMM implementation can only be broken up over the M & N
-         * dimensions, we inform the frame work of this limitation via the get_window_size function
-         */
-        const auto m_start = work_range.get_position(0);
-        const auto n_start = work_range.get_position(1);
-        const auto m_size  = work_range.get_size(0);
-        const auto n_size  = work_range.get_size(1);
-        const auto m_end   = m_start + m_size;
-        const auto n_end   = n_start + n_size;
-
-        const auto m_threadid = thread_locator.get_position(0);
-        const auto n_threadid = thread_locator.get_position(1);
-
-        execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
-    }
-
-    std::size_t get_working_size() const override {
-        /* Because we do not know how schedular will break up
-         * the task, we need to ensure that alloc enough
-         * space to be able to handle the case where every thread
-         * is parallelised across B AND also every thrread is parallelised across A
-         *
-         * If we parallelise across A, then we only need one buffer of A and 64 buffers of B
-         * If we parallelise across B, then we only need 64 buffer of B and
-         */
-        return get_c_working_size() * _maxthreads
-             + get_a_working_size() * _maxthreads
-             + 64; //to account for cacheline alignment
-    }
-
-
-    void set_working_space(void *working_space) override {
-        // Make sure everything ends up cache line aligned
-        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
-        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);
-
-        size_t diff=0;
-
-        if (working_space_int & 0x3F) {
-            diff = 0x40 - (working_space_int & 0x3F);
-        }
-
-        working_space_bytes += diff;
-
-        _working_space = reinterpret_cast<void *>(working_space_bytes);
-    }
-
-    // Interface implementation - pretransposed
-    bool B_is_pretransposed() const override {
-        return true;
-    }
-
-    bool B_pretranspose_required() const override {
-        return _B_transposed==nullptr;
-    }
-
-    // TODO: this could almost certainly be considerably simpler.
-    size_t get_B_pretransposed_array_size() const override {
-        size_t total=0;
-        blockwalker current(*this);
-
-        do {
-            /* Figure out the size of each block. */
-            unsigned int x_size = (current.xmax() - current.x0());
-            unsigned int k_size = (current.kmax() - current.k0());
-
-            /* Round sizes up as needed. */
-            x_size = iceildiv(x_size, strategy::out_width());
-            x_size *= strategy::out_width();
-
-            k_size = iceildiv(k_size, strategy::k_unroll());
-            k_size *= strategy::k_unroll();
-
-            total += x_size * k_size * sizeof(Toi);
-        } while (current.advance());
-
-        return total;
-    }
-
-    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        blockwalker current(*this);
-        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
-        _B_transposed = buffer;
-        strategy strat(_ci);
-
-        do {
-            /* Figure out the size of each block. */
-            unsigned int x_size = (current.xmax() - current.x0());
-            unsigned int k_size = (current.kmax() - current.k0());
-
-            /* Round sizes up as needed. */
-            x_size = iceildiv(x_size, strategy::out_width());
-            x_size *= strategy::out_width();
-
-            k_size = iceildiv(k_size, strategy::k_unroll());
-            k_size *= strategy::k_unroll();
-
-            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                      current.x0(), current.xmax(), current.k0(), current.kmax());
-
-            buffer += (x_size * k_size);
-        } while (current.advance());
-    }
-
-    void set_pretransposed_B_data(void *in_buffer) override {
-        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
-    }
-
-    // Estimate cycles for given problem given provided parameters
-    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
-        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
-        unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
-        unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
-
-        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
-        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
-        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
-
-        // Wide problems incur extra preparation cost, as it is done per thread.
-        // Duplicate the logic the scheduler will later use to figure out how much that will affect us
-        float ratio = m_blocks / static_cast<float>(n_blocks);
-
-        unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
-        unsigned int height = 1;
-
-        if (ideal_height == 0) {
-            height = 1;
-        } else {
-            for (unsigned int adj=0; adj<ideal_height; adj++) {
-                const unsigned int round_down = ideal_height - adj;
-                if (args._maxthreads % round_down == 0) {
-                    height = round_down;
-                    break;
-                }
-
-                const unsigned int round_up = ideal_height + adj;
-                if (args._maxthreads % round_up == 0) {
-                    height = round_up;
-                    break;
-                }
-            }
-        }
-
-        // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
-        prepare_bytes *= (args._maxthreads / height);
-
-        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
-        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
-        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
-
-        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
-
-        // We can't thread over multis, which might be a problem in some
-        // threaded cases.  Penalize that here.
-        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;
-
-        if (parallelism_available < args._maxthreads) {
-            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
-        }
-
-        return static_cast<uint64_t>(total_cycles);
-    }
-};
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 9e8907d..c725815 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -80,7 +80,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8q_mopa_1VLx4VL",
-    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
     [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
                                return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL, int8_t, int8_t>(args, qp); }
@@ -88,7 +88,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8q_mopa_4VLx1VL",
-    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
     [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
                                return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL, int8_t, int8_t>(args, qp); }
@@ -96,7 +96,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8q_mopa_2VLx2VL",
-    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL, int8_t, int8_t>(args, qp); }
 },
@@ -265,6 +265,7 @@
 
 template UniqueGemmCommon<int8_t, int8_t> gemm<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
 template bool has_opt_gemm<int8_t, int8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
+template KernelDescription get_gemm_method<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
 template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index f93f56b..6254ec6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -76,7 +76,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_u8q_mopa_1VLx4VL",
-    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
     [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<uint32_t>();
                                return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL, uint8_t, uint8_t>(args, qp); }
@@ -84,7 +84,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_u8q_mopa_4VLx1VL",
-    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
     [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length<int32_t>();
                                return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL, uint8_t, uint8_t>(args, qp); }
@@ -92,7 +92,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_u8q_mopa_2VLx2VL",
-    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));},
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline<cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL, uint8_t, uint8_t>(args, qp); }
 },
@@ -233,6 +233,7 @@
 
 template UniqueGemmCommon<uint8_t, uint8_t> gemm<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
 template bool has_opt_gemm<uint8_t, uint8_t, Requantize32>(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os);
+template KernelDescription get_gemm_method<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
 template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, Requantize32>(const GemmArgs &args, const Requantize32 &os);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index fc836f9..25b6cf0 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020, 2022 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,7 @@
 /* Explicitly instantiate the external functions for these types. */
 template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template bool has_opt_gemm<uint16_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index fcc95eb..af5cfbb 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -150,6 +150,7 @@
 /* Explicitly instantiate the external functions for these types. */
 template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template bool has_opt_gemm<uint8_t, uint32_t, Nothing>(WeightFormat &weight_format, const GemmArgs &args, const Nothing &);
+template KernelDescription get_gemm_method<uint8_t, uint32_t, Nothing>(const GemmArgs &args, const Nothing &);
 template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t, Nothing> (const GemmArgs &args, const Nothing &);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
index 4dfe464..e4bfc0f 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -170,7 +170,6 @@
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "12:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "x20", "x21", "x22", "x23"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
index 56ca49a..23800ed 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
@@ -210,8 +210,8 @@
       "sadalp v22.4s, v26.8h\n"
       "sadalp v21.4s, v25.8h\n"
       "addp v24.4s, v24.4s, v23.4s\n"
-      "addp v23.4s, v22.4s, v21.4s\n"
-      "addp v24.4s, v24.4s, v23.4s\n"
+      "addp v16.4s, v22.4s, v21.4s\n"
+      "addp v24.4s, v24.4s, v16.4s\n"
       "add v24.4s, v24.4s, v20.4s\n"
       "str q24, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
index 4c7bb71..15545c2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
@@ -210,8 +210,8 @@
       "uadalp v22.4s, v26.8h\n"
       "uadalp v21.4s, v25.8h\n"
       "addp v24.4s, v24.4s, v23.4s\n"
-      "addp v23.4s, v22.4s, v21.4s\n"
-      "addp v24.4s, v24.4s, v23.4s\n"
+      "addp v16.4s, v22.4s, v21.4s\n"
+      "addp v24.4s, v24.4s, v16.4s\n"
       "add v24.4s, v24.4s, v20.4s\n"
       "str q24, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
index 2ba2aa8..b900c33 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
@@ -80,36 +80,36 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr d28, [x28], #0x8\n"
-      "ldr d27, [x27], #0x8\n"
-      "shll v28.4s, v28.4h, #0x10\n"
+      "ldr d27, [x28], #0x8\n"
+      "ldr d26, [x27], #0x8\n"
       "shll v27.4s, v27.4h, #0x10\n"
+      "shll v26.4s, v26.4h, #0x10\n"
       "ldr d22, [x26], #0x8\n"
       "ldr d21, [x25], #0x8\n"
       "shll v22.4s, v22.4h, #0x10\n"
       "shll v21.4s, v21.4h, #0x10\n"
-      "ldr d26, [x24], #0x8\n"
+      "ldr d20, [x24], #0x8\n"
       "ldr d25, [x23], #0x8\n"
-      "shll v26.4s, v26.4h, #0x10\n"
-      "shll v25.4s, v25.4h, #0x10\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d19, [x21], #0x8\n"
       "shll v20.4s, v20.4h, #0x10\n"
+      "shll v25.4s, v25.4h, #0x10\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d16, [x21], #0x8\n"
       "shll v19.4s, v19.4h, #0x10\n"
-      "zip1 v24.4s, v28.4s, v22.4s\n"
-      "zip1 v23.4s, v27.4s, v21.4s\n"
+      "shll v16.4s, v16.4h, #0x10\n"
+      "zip1 v24.4s, v27.4s, v22.4s\n"
+      "zip1 v23.4s, v26.4s, v21.4s\n"
       "subs %x[width], %x[width], #0x4\n"
       "cmp %x[width], #0x4\n"
-      "zip1 v18.4s, v26.4s, v20.4s\n"
-      "zip1 v17.4s, v25.4s, v19.4s\n"
+      "zip1 v18.4s, v20.4s, v19.4s\n"
+      "zip1 v17.4s, v25.4s, v16.4s\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "zip2 v21.4s, v26.4s, v21.4s\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip2 v20.4s, v26.4s, v20.4s\n"
-      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "zip2 v20.4s, v20.4s, v19.4s\n"
+      "zip2 v19.4s, v25.4s, v16.4s\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
       "prfm pldl1keep, [x22, #0x70]\n"
@@ -138,71 +138,70 @@
       "ldr s28, [x28], #0x4\n"
       "ldr s27, [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ldr s22, [x26], #0x4\n"
-      "ldr s21, [x25], #0x4\n"
-      "ldr s26, [x24], #0x4\n"
-      "ldr s25, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s19, [x21], #0x4\n"
+      "ldr s26, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s24, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s22, [x22], #0x4\n"
+      "ldr s21, [x21], #0x4\n"
       "tbz %x[width], #0, 5f\n"
       "ld1 { v28.h }[2], [x28]\n"
       "ld1 { v27.h }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.h }[2], [x26]\n"
-      "ld1 { v21.h }[2], [x25]\n"
-      "ld1 { v26.h }[2], [x24]\n"
-      "ld1 { v25.h }[2], [x23]\n"
-      "ld1 { v20.h }[2], [x22]\n"
-      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v26.h }[2], [x26]\n"
+      "ld1 { v25.h }[2], [x25]\n"
+      "ld1 { v24.h }[2], [x24]\n"
+      "ld1 { v23.h }[2], [x23]\n"
+      "ld1 { v22.h }[2], [x22]\n"
+      "ld1 { v21.h }[2], [x21]\n"
       "b 5f\n"
       "4:"  // odd_loads_1_0
       "ldr h28, [x28, #0x0]\n"
       "ldr h27, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h22, [x26, #0x0]\n"
-      "ldr h21, [x25, #0x0]\n"
-      "ldr h26, [x24, #0x0]\n"
-      "ldr h25, [x23, #0x0]\n"
-      "ldr h20, [x22, #0x0]\n"
-      "ldr h19, [x21, #0x0]\n"
+      "ldr h26, [x26, #0x0]\n"
+      "ldr h25, [x25, #0x0]\n"
+      "ldr h24, [x24, #0x0]\n"
+      "ldr h23, [x23, #0x0]\n"
+      "ldr h22, [x22, #0x0]\n"
+      "ldr h21, [x21, #0x0]\n"
       "5:"  // Odd load end
       "shll v28.4s, v28.4h, #0x10\n"
       "shll v27.4s, v27.4h, #0x10\n"
       "subs x20, x20, #0x1\n"
-      "shll v22.4s, v22.4h, #0x10\n"
-      "shll v21.4s, v21.4h, #0x10\n"
       "shll v26.4s, v26.4h, #0x10\n"
       "shll v25.4s, v25.4h, #0x10\n"
-      "shll v20.4s, v20.4h, #0x10\n"
-      "shll v19.4s, v19.4h, #0x10\n"
-      "zip1 v24.4s, v28.4s, v22.4s\n"
-      "zip1 v23.4s, v27.4s, v21.4s\n"
-      "zip1 v18.4s, v26.4s, v20.4s\n"
-      "zip1 v17.4s, v25.4s, v19.4s\n"
-      "zip1 v16.4s, v24.4s, v23.4s\n"
+      "shll v24.4s, v24.4h, #0x10\n"
+      "shll v23.4s, v23.4h, #0x10\n"
+      "shll v22.4s, v22.4h, #0x10\n"
+      "shll v21.4s, v21.4h, #0x10\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
+      "zip1 v19.4s, v27.4s, v25.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 6f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.4s, v24.4s, v23.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip2 v17.4s, v18.4s, v17.4s\n"
-      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 6f\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
-      "zip2 v20.4s, v26.4s, v20.4s\n"
-      "zip2 v19.4s, v25.4s, v19.4s\n"
-      "zip1 v16.4s, v22.4s, v21.4s\n"
+      "zip2 v19.4s, v28.4s, v26.4s\n"
+      "zip2 v16.4s, v27.4s, v25.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v16.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v18.4s, v20.4s, v19.4s\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "6:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
index f55c2be..e54b3b9 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -80,33 +80,33 @@
       "blt 3f\n"
       "2:"  // Main loop head
       "ldr q25, [x28], #0x10\n"
-      "ldr q30, [x27], #0x10\n"
+      "ldr q27, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x8\n"
       "cmp %x[width], #0x8\n"
-      "ldr q29, [x26], #0x10\n"
-      "ldr q28, [x25], #0x10\n"
+      "ldr q26, [x26], #0x10\n"
+      "ldr q24, [x25], #0x10\n"
       "ldr q21, [x24], #0x10\n"
-      "ldr q27, [x23], #0x10\n"
+      "ldr q20, [x23], #0x10\n"
       "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v26.8h, v30.8h, v27.8h\n"
-      "ldr q20, [x22], #0x10\n"
-      "ldr q22, [x21], #0x10\n"
-      "zip1 v19.8h, v29.8h, v20.8h\n"
-      "zip1 v18.8h, v28.8h, v22.8h\n"
+      "zip1 v22.8h, v27.8h, v20.8h\n"
+      "ldr q17, [x22], #0x10\n"
+      "ldr q16, [x21], #0x10\n"
+      "zip1 v19.8h, v26.8h, v17.8h\n"
+      "zip1 v18.8h, v24.8h, v16.8h\n"
       "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
+      "zip2 v21.8h, v26.8h, v17.8h\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v22.8h\n"
+      "zip2 v20.8h, v27.8h, v20.8h\n"
+      "zip2 v16.8h, v24.8h, v16.8h\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
       "zip1 v24.8h, v23.8h, v19.8h\n"
-      "zip1 v17.8h, v26.8h, v18.8h\n"
+      "zip1 v17.8h, v22.8h, v18.8h\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
       "zip2 v23.8h, v23.8h, v19.8h\n"
-      "zip2 v19.8h, v26.8h, v18.8h\n"
+      "zip2 v19.8h, v22.8h, v18.8h\n"
       "prfm pldl1keep, [x22, #0x70]\n"
       "prfm pldl1keep, [x21, #0x70]\n"
       "zip1 v22.8h, v25.8h, v21.8h\n"
@@ -134,132 +134,131 @@
       "3:"  // Main loop skip
       "cbz %x[width], 8f\n"
       "tbz %x[width], #2, 5f\n"
-      "ldr d25, [x28], #0x8\n"
-      "ldr d30, [x27], #0x8\n"
-      "ldr d29, [x26], #0x8\n"
-      "ldr d28, [x25], #0x8\n"
-      "ldr d21, [x24], #0x8\n"
-      "ldr d27, [x23], #0x8\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d22, [x21], #0x8\n"
+      "ldr d30, [x28], #0x8\n"
+      "ldr d29, [x27], #0x8\n"
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d24, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
       "tbz %x[width], #1, 4f\n"
-      "ld1 { v25.s }[2], [x28], #0x4\n"
-      "ld1 { v30.s }[2], [x27], #0x4\n"
+      "ld1 { v30.s }[2], [x28], #0x4\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.s }[2], [x26], #0x4\n"
-      "ld1 { v28.s }[2], [x25], #0x4\n"
-      "ld1 { v21.s }[2], [x24], #0x4\n"
-      "ld1 { v27.s }[2], [x23], #0x4\n"
-      "ld1 { v20.s }[2], [x22], #0x4\n"
-      "ld1 { v22.s }[2], [x21], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "ld1 { v24.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.h }[6], [x28]\n"
-      "ld1 { v30.h }[6], [x27]\n"
+      "ld1 { v30.h }[6], [x28]\n"
+      "ld1 { v29.h }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.h }[6], [x26]\n"
-      "ld1 { v28.h }[6], [x25]\n"
-      "ld1 { v21.h }[6], [x24]\n"
-      "ld1 { v27.h }[6], [x23]\n"
-      "ld1 { v20.h }[6], [x22]\n"
-      "ld1 { v22.h }[6], [x21]\n"
+      "ld1 { v28.h }[6], [x26]\n"
+      "ld1 { v27.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x24]\n"
+      "ld1 { v25.h }[6], [x23]\n"
+      "ld1 { v24.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
       "b 7f\n"
       "4:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.h }[4], [x28]\n"
-      "ld1 { v30.h }[4], [x27]\n"
+      "ld1 { v30.h }[4], [x28]\n"
+      "ld1 { v29.h }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.h }[4], [x26]\n"
-      "ld1 { v28.h }[4], [x25]\n"
-      "ld1 { v21.h }[4], [x24]\n"
-      "ld1 { v27.h }[4], [x23]\n"
-      "ld1 { v20.h }[4], [x22]\n"
-      "ld1 { v22.h }[4], [x21]\n"
+      "ld1 { v28.h }[4], [x26]\n"
+      "ld1 { v27.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x24]\n"
+      "ld1 { v25.h }[4], [x23]\n"
+      "ld1 { v24.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
       "b 7f\n"
       "5:"  // odd_loads_2_0
       "tbz %x[width], #1, 6f\n"
-      "ldr s25, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s21, [x24], #0x4\n"
-      "ldr s27, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s22, [x21], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.h }[2], [x28]\n"
-      "ld1 { v30.h }[2], [x27]\n"
+      "ld1 { v30.h }[2], [x28]\n"
+      "ld1 { v29.h }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.h }[2], [x26]\n"
-      "ld1 { v28.h }[2], [x25]\n"
-      "ld1 { v21.h }[2], [x24]\n"
-      "ld1 { v27.h }[2], [x23]\n"
-      "ld1 { v20.h }[2], [x22]\n"
-      "ld1 { v22.h }[2], [x21]\n"
+      "ld1 { v28.h }[2], [x26]\n"
+      "ld1 { v27.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "ld1 { v24.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
       "b 7f\n"
       "6:"  // odd_loads_1_0
-      "ldr h25, [x28, #0x0]\n"
-      "ldr h30, [x27, #0x0]\n"
+      "ldr h30, [x28, #0x0]\n"
+      "ldr h29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h29, [x26, #0x0]\n"
-      "ldr h28, [x25, #0x0]\n"
-      "ldr h21, [x24, #0x0]\n"
-      "ldr h27, [x23, #0x0]\n"
-      "ldr h20, [x22, #0x0]\n"
-      "ldr h22, [x21, #0x0]\n"
+      "ldr h28, [x26, #0x0]\n"
+      "ldr h27, [x25, #0x0]\n"
+      "ldr h26, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "ldr h24, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
       "7:"  // Odd load end
-      "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v19.8h, v29.8h, v20.8h\n"
+      "zip1 v22.8h, v30.8h, v26.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v26.8h, v30.8h, v27.8h\n"
-      "zip1 v18.8h, v28.8h, v22.8h\n"
-      "zip1 v24.8h, v23.8h, v19.8h\n"
-      "zip1 v17.8h, v26.8h, v18.8h\n"
-      "zip1 v16.8h, v24.8h, v17.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v24.8h, v17.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v23.8h, v23.8h, v19.8h\n"
-      "zip2 v19.8h, v26.8h, v18.8h\n"
+      "zip2 v18.8h, v22.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v17.8h, v23.8h, v19.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 8f\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v23.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v22.8h\n"
-      "zip1 v22.8h, v25.8h, v21.8h\n"
-      "zip1 v18.8h, v20.8h, v16.8h\n"
-      "zip1 v19.8h, v22.8h, v18.8h\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v18.8h, v22.8h, v18.8h\n"
-      "str q18, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v21.8h, v25.8h, v21.8h\n"
-      "zip2 v20.8h, v20.8h, v16.8h\n"
-      "zip1 v17.8h, v21.8h, v20.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "8:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
index f64db0b..3a5dcf4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -79,36 +79,36 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr d28, [x28], #0x8\n"
-      "ldr d27, [x27], #0x8\n"
-      "fcvtl v28.4s, v28.4h\n"
+      "ldr d27, [x28], #0x8\n"
+      "ldr d26, [x27], #0x8\n"
       "fcvtl v27.4s, v27.4h\n"
+      "fcvtl v26.4s, v26.4h\n"
       "ldr d22, [x26], #0x8\n"
       "ldr d21, [x25], #0x8\n"
       "fcvtl v22.4s, v22.4h\n"
       "fcvtl v21.4s, v21.4h\n"
-      "ldr d26, [x24], #0x8\n"
+      "ldr d20, [x24], #0x8\n"
       "ldr d25, [x23], #0x8\n"
-      "fcvtl v26.4s, v26.4h\n"
-      "fcvtl v25.4s, v25.4h\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d19, [x21], #0x8\n"
       "fcvtl v20.4s, v20.4h\n"
+      "fcvtl v25.4s, v25.4h\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d16, [x21], #0x8\n"
       "fcvtl v19.4s, v19.4h\n"
-      "zip1 v24.4s, v28.4s, v22.4s\n"
-      "zip1 v23.4s, v27.4s, v21.4s\n"
+      "fcvtl v16.4s, v16.4h\n"
+      "zip1 v24.4s, v27.4s, v22.4s\n"
+      "zip1 v23.4s, v26.4s, v21.4s\n"
       "subs %x[width], %x[width], #0x4\n"
       "cmp %x[width], #0x4\n"
-      "zip1 v18.4s, v26.4s, v20.4s\n"
-      "zip1 v17.4s, v25.4s, v19.4s\n"
+      "zip1 v18.4s, v20.4s, v19.4s\n"
+      "zip1 v17.4s, v25.4s, v16.4s\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "zip2 v21.4s, v26.4s, v21.4s\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip2 v20.4s, v26.4s, v20.4s\n"
-      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "zip2 v20.4s, v20.4s, v19.4s\n"
+      "zip2 v19.4s, v25.4s, v16.4s\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
       "prfm pldl1keep, [x22, #0x70]\n"
@@ -137,71 +137,70 @@
       "ldr s28, [x28], #0x4\n"
       "ldr s27, [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ldr s22, [x26], #0x4\n"
-      "ldr s21, [x25], #0x4\n"
-      "ldr s26, [x24], #0x4\n"
-      "ldr s25, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s19, [x21], #0x4\n"
+      "ldr s26, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s24, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s22, [x22], #0x4\n"
+      "ldr s21, [x21], #0x4\n"
       "tbz %x[width], #0, 5f\n"
       "ld1 { v28.h }[2], [x28]\n"
       "ld1 { v27.h }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.h }[2], [x26]\n"
-      "ld1 { v21.h }[2], [x25]\n"
-      "ld1 { v26.h }[2], [x24]\n"
-      "ld1 { v25.h }[2], [x23]\n"
-      "ld1 { v20.h }[2], [x22]\n"
-      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v26.h }[2], [x26]\n"
+      "ld1 { v25.h }[2], [x25]\n"
+      "ld1 { v24.h }[2], [x24]\n"
+      "ld1 { v23.h }[2], [x23]\n"
+      "ld1 { v22.h }[2], [x22]\n"
+      "ld1 { v21.h }[2], [x21]\n"
       "b 5f\n"
       "4:"  // odd_loads_1_0
       "ldr h28, [x28, #0x0]\n"
       "ldr h27, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h22, [x26, #0x0]\n"
-      "ldr h21, [x25, #0x0]\n"
-      "ldr h26, [x24, #0x0]\n"
-      "ldr h25, [x23, #0x0]\n"
-      "ldr h20, [x22, #0x0]\n"
-      "ldr h19, [x21, #0x0]\n"
+      "ldr h26, [x26, #0x0]\n"
+      "ldr h25, [x25, #0x0]\n"
+      "ldr h24, [x24, #0x0]\n"
+      "ldr h23, [x23, #0x0]\n"
+      "ldr h22, [x22, #0x0]\n"
+      "ldr h21, [x21, #0x0]\n"
       "5:"  // Odd load end
       "fcvtl v28.4s, v28.4h\n"
       "fcvtl v27.4s, v27.4h\n"
       "subs x20, x20, #0x1\n"
-      "fcvtl v22.4s, v22.4h\n"
-      "fcvtl v21.4s, v21.4h\n"
       "fcvtl v26.4s, v26.4h\n"
       "fcvtl v25.4s, v25.4h\n"
-      "fcvtl v20.4s, v20.4h\n"
-      "fcvtl v19.4s, v19.4h\n"
-      "zip1 v24.4s, v28.4s, v22.4s\n"
-      "zip1 v23.4s, v27.4s, v21.4s\n"
-      "zip1 v18.4s, v26.4s, v20.4s\n"
-      "zip1 v17.4s, v25.4s, v19.4s\n"
-      "zip1 v16.4s, v24.4s, v23.4s\n"
+      "fcvtl v24.4s, v24.4h\n"
+      "fcvtl v23.4s, v23.4h\n"
+      "fcvtl v22.4s, v22.4h\n"
+      "fcvtl v21.4s, v21.4h\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
+      "zip1 v19.4s, v27.4s, v25.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 6f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.4s, v24.4s, v23.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip2 v17.4s, v18.4s, v17.4s\n"
-      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 6f\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
-      "zip2 v20.4s, v26.4s, v20.4s\n"
-      "zip2 v19.4s, v25.4s, v19.4s\n"
-      "zip1 v16.4s, v22.4s, v21.4s\n"
+      "zip2 v19.4s, v28.4s, v26.4s\n"
+      "zip2 v16.4s, v27.4s, v25.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v16.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v18.4s, v20.4s, v19.4s\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "6:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
index 6c009b3..80c387d 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -79,29 +79,29 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q28, [x28], #0x10\n"
-      "ldr q27, [x27], #0x10\n"
+      "ldr q20, [x28], #0x10\n"
+      "ldr q18, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x4\n"
       "cmp %x[width], #0x4\n"
-      "ldr q22, [x26], #0x10\n"
-      "ldr q21, [x25], #0x10\n"
-      "zip1 v26.4s, v28.4s, v22.4s\n"
-      "zip1 v25.4s, v27.4s, v21.4s\n"
-      "ldr q24, [x24], #0x10\n"
+      "ldr q17, [x26], #0x10\n"
+      "ldr q16, [x25], #0x10\n"
+      "zip1 v25.4s, v20.4s, v17.4s\n"
+      "zip1 v24.4s, v18.4s, v16.4s\n"
+      "ldr q19, [x24], #0x10\n"
       "ldr q23, [x23], #0x10\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
-      "ldr q19, [x22], #0x10\n"
-      "ldr q18, [x21], #0x10\n"
-      "zip1 v20.4s, v24.4s, v19.4s\n"
-      "zip1 v17.4s, v23.4s, v18.4s\n"
-      "zip2 v19.4s, v24.4s, v19.4s\n"
-      "zip2 v18.4s, v23.4s, v18.4s\n"
+      "zip2 v22.4s, v20.4s, v17.4s\n"
+      "zip2 v21.4s, v18.4s, v16.4s\n"
+      "ldr q18, [x22], #0x10\n"
+      "ldr q16, [x21], #0x10\n"
+      "zip1 v20.4s, v19.4s, v18.4s\n"
+      "zip1 v17.4s, v23.4s, v16.4s\n"
+      "zip2 v19.4s, v19.4s, v18.4s\n"
+      "zip2 v18.4s, v23.4s, v16.4s\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip1 v16.4s, v26.4s, v25.4s\n"
+      "zip1 v16.4s, v25.4s, v24.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
@@ -109,7 +109,7 @@
       "str q16, [%x[out_ptr], #0x10]\n"
       "prfm pldl1keep, [x22, #0x70]\n"
       "prfm pldl1keep, [x21, #0x70]\n"
-      "zip2 v16.4s, v26.4s, v25.4s\n"
+      "zip2 v16.4s, v25.4s, v24.4s\n"
       "str q16, [%x[out_ptr], #0x20]\n"
       "zip2 v16.4s, v20.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x30]\n"
@@ -129,63 +129,62 @@
       "ldr d28, [x28], #0x8\n"
       "ldr d27, [x27], #0x8\n"
       "mov x20, #0x2\n"
-      "ldr d22, [x26], #0x8\n"
-      "ldr d21, [x25], #0x8\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
       "ldr d24, [x24], #0x8\n"
       "ldr d23, [x23], #0x8\n"
-      "ldr d19, [x22], #0x8\n"
-      "ldr d18, [x21], #0x8\n"
+      "ldr d22, [x22], #0x8\n"
+      "ldr d21, [x21], #0x8\n"
       "tbz %x[width], #0, 5f\n"
       "ld1 { v28.s }[2], [x28]\n"
       "ld1 { v27.s }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.s }[2], [x26]\n"
-      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x26]\n"
+      "ld1 { v25.s }[2], [x25]\n"
       "ld1 { v24.s }[2], [x24]\n"
       "ld1 { v23.s }[2], [x23]\n"
-      "ld1 { v19.s }[2], [x22]\n"
-      "ld1 { v18.s }[2], [x21]\n"
+      "ld1 { v22.s }[2], [x22]\n"
+      "ld1 { v21.s }[2], [x21]\n"
       "b 5f\n"
       "4:"  // odd_loads_1_0
       "ldr s28, [x28, #0x0]\n"
       "ldr s27, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr s22, [x26, #0x0]\n"
-      "ldr s21, [x25, #0x0]\n"
+      "ldr s26, [x26, #0x0]\n"
+      "ldr s25, [x25, #0x0]\n"
       "ldr s24, [x24, #0x0]\n"
       "ldr s23, [x23, #0x0]\n"
-      "ldr s19, [x22, #0x0]\n"
-      "ldr s18, [x21, #0x0]\n"
+      "ldr s22, [x22, #0x0]\n"
+      "ldr s21, [x21, #0x0]\n"
       "5:"  // Odd load end
-      "zip1 v26.4s, v28.4s, v22.4s\n"
-      "zip1 v25.4s, v27.4s, v21.4s\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
+      "zip1 v19.4s, v27.4s, v25.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v20.4s, v24.4s, v19.4s\n"
-      "zip1 v17.4s, v23.4s, v18.4s\n"
-      "zip1 v16.4s, v26.4s, v25.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v16.4s, v20.4s, v17.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 6f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.4s, v26.4s, v25.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip2 v16.4s, v20.4s, v17.4s\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 6f\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
-      "zip2 v19.4s, v24.4s, v19.4s\n"
-      "zip2 v18.4s, v23.4s, v18.4s\n"
-      "zip1 v16.4s, v22.4s, v21.4s\n"
+      "zip2 v19.4s, v28.4s, v26.4s\n"
+      "zip2 v16.4s, v27.4s, v25.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v16.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "6:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
index 767d468..8e06b7e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -80,33 +80,33 @@
       "blt 3f\n"
       "2:"  // Main loop head
       "ldr q25, [x28], #0x10\n"
-      "ldr q30, [x27], #0x10\n"
+      "ldr q27, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x8\n"
       "cmp %x[width], #0x8\n"
-      "ldr q29, [x26], #0x10\n"
-      "ldr q28, [x25], #0x10\n"
+      "ldr q26, [x26], #0x10\n"
+      "ldr q24, [x25], #0x10\n"
       "ldr q21, [x24], #0x10\n"
-      "ldr q27, [x23], #0x10\n"
+      "ldr q20, [x23], #0x10\n"
       "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v26.8h, v30.8h, v27.8h\n"
-      "ldr q20, [x22], #0x10\n"
-      "ldr q22, [x21], #0x10\n"
-      "zip1 v19.8h, v29.8h, v20.8h\n"
-      "zip1 v18.8h, v28.8h, v22.8h\n"
+      "zip1 v22.8h, v27.8h, v20.8h\n"
+      "ldr q17, [x22], #0x10\n"
+      "ldr q16, [x21], #0x10\n"
+      "zip1 v19.8h, v26.8h, v17.8h\n"
+      "zip1 v18.8h, v24.8h, v16.8h\n"
       "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
+      "zip2 v21.8h, v26.8h, v17.8h\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v22.8h\n"
+      "zip2 v20.8h, v27.8h, v20.8h\n"
+      "zip2 v16.8h, v24.8h, v16.8h\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
       "zip1 v24.8h, v23.8h, v19.8h\n"
-      "zip1 v17.8h, v26.8h, v18.8h\n"
+      "zip1 v17.8h, v22.8h, v18.8h\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
       "zip2 v23.8h, v23.8h, v19.8h\n"
-      "zip2 v19.8h, v26.8h, v18.8h\n"
+      "zip2 v19.8h, v22.8h, v18.8h\n"
       "prfm pldl1keep, [x22, #0x70]\n"
       "prfm pldl1keep, [x21, #0x70]\n"
       "zip1 v22.8h, v25.8h, v21.8h\n"
@@ -134,132 +134,131 @@
       "3:"  // Main loop skip
       "cbz %x[width], 8f\n"
       "tbz %x[width], #2, 5f\n"
-      "ldr d25, [x28], #0x8\n"
-      "ldr d30, [x27], #0x8\n"
-      "ldr d29, [x26], #0x8\n"
-      "ldr d28, [x25], #0x8\n"
-      "ldr d21, [x24], #0x8\n"
-      "ldr d27, [x23], #0x8\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d22, [x21], #0x8\n"
+      "ldr d30, [x28], #0x8\n"
+      "ldr d29, [x27], #0x8\n"
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d24, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
       "tbz %x[width], #1, 4f\n"
-      "ld1 { v25.s }[2], [x28], #0x4\n"
-      "ld1 { v30.s }[2], [x27], #0x4\n"
+      "ld1 { v30.s }[2], [x28], #0x4\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.s }[2], [x26], #0x4\n"
-      "ld1 { v28.s }[2], [x25], #0x4\n"
-      "ld1 { v21.s }[2], [x24], #0x4\n"
-      "ld1 { v27.s }[2], [x23], #0x4\n"
-      "ld1 { v20.s }[2], [x22], #0x4\n"
-      "ld1 { v22.s }[2], [x21], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "ld1 { v24.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.h }[6], [x28]\n"
-      "ld1 { v30.h }[6], [x27]\n"
+      "ld1 { v30.h }[6], [x28]\n"
+      "ld1 { v29.h }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.h }[6], [x26]\n"
-      "ld1 { v28.h }[6], [x25]\n"
-      "ld1 { v21.h }[6], [x24]\n"
-      "ld1 { v27.h }[6], [x23]\n"
-      "ld1 { v20.h }[6], [x22]\n"
-      "ld1 { v22.h }[6], [x21]\n"
+      "ld1 { v28.h }[6], [x26]\n"
+      "ld1 { v27.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x24]\n"
+      "ld1 { v25.h }[6], [x23]\n"
+      "ld1 { v24.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
       "b 7f\n"
       "4:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.h }[4], [x28]\n"
-      "ld1 { v30.h }[4], [x27]\n"
+      "ld1 { v30.h }[4], [x28]\n"
+      "ld1 { v29.h }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.h }[4], [x26]\n"
-      "ld1 { v28.h }[4], [x25]\n"
-      "ld1 { v21.h }[4], [x24]\n"
-      "ld1 { v27.h }[4], [x23]\n"
-      "ld1 { v20.h }[4], [x22]\n"
-      "ld1 { v22.h }[4], [x21]\n"
+      "ld1 { v28.h }[4], [x26]\n"
+      "ld1 { v27.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x24]\n"
+      "ld1 { v25.h }[4], [x23]\n"
+      "ld1 { v24.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
       "b 7f\n"
       "5:"  // odd_loads_2_0
       "tbz %x[width], #1, 6f\n"
-      "ldr s25, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s21, [x24], #0x4\n"
-      "ldr s27, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s22, [x21], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.h }[2], [x28]\n"
-      "ld1 { v30.h }[2], [x27]\n"
+      "ld1 { v30.h }[2], [x28]\n"
+      "ld1 { v29.h }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.h }[2], [x26]\n"
-      "ld1 { v28.h }[2], [x25]\n"
-      "ld1 { v21.h }[2], [x24]\n"
-      "ld1 { v27.h }[2], [x23]\n"
-      "ld1 { v20.h }[2], [x22]\n"
-      "ld1 { v22.h }[2], [x21]\n"
+      "ld1 { v28.h }[2], [x26]\n"
+      "ld1 { v27.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "ld1 { v24.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
       "b 7f\n"
       "6:"  // odd_loads_1_0
-      "ldr h25, [x28, #0x0]\n"
-      "ldr h30, [x27, #0x0]\n"
+      "ldr h30, [x28, #0x0]\n"
+      "ldr h29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h29, [x26, #0x0]\n"
-      "ldr h28, [x25, #0x0]\n"
-      "ldr h21, [x24, #0x0]\n"
-      "ldr h27, [x23, #0x0]\n"
-      "ldr h20, [x22, #0x0]\n"
-      "ldr h22, [x21, #0x0]\n"
+      "ldr h28, [x26, #0x0]\n"
+      "ldr h27, [x25, #0x0]\n"
+      "ldr h26, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "ldr h24, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
       "7:"  // Odd load end
-      "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v19.8h, v29.8h, v20.8h\n"
+      "zip1 v22.8h, v30.8h, v26.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v26.8h, v30.8h, v27.8h\n"
-      "zip1 v18.8h, v28.8h, v22.8h\n"
-      "zip1 v24.8h, v23.8h, v19.8h\n"
-      "zip1 v17.8h, v26.8h, v18.8h\n"
-      "zip1 v16.8h, v24.8h, v17.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v24.8h, v17.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v23.8h, v23.8h, v19.8h\n"
-      "zip2 v19.8h, v26.8h, v18.8h\n"
+      "zip2 v18.8h, v22.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v17.8h, v23.8h, v19.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 8f\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v23.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v22.8h\n"
-      "zip1 v22.8h, v25.8h, v21.8h\n"
-      "zip1 v18.8h, v20.8h, v16.8h\n"
-      "zip1 v19.8h, v22.8h, v18.8h\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v18.8h, v22.8h, v18.8h\n"
-      "str q18, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v21.8h, v25.8h, v21.8h\n"
-      "zip2 v20.8h, v20.8h, v16.8h\n"
-      "zip1 v17.8h, v21.8h, v20.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "8:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
index a737920..b91ae8a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
@@ -159,118 +159,86 @@
       "5:"  // Main loop skip
       "cbz %x[width], 10f\n"
       "tbz %x[width], #2, 7f\n"
-      "ldr d31, [x28], #0x8\n"
-      "ldr d30, [x27], #0x8\n"
-      "ldr d29, [x26], #0x8\n"
-      "ldr d28, [x25], #0x8\n"
-      "ldr d27, [x24], #0x8\n"
-      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x28], #0x8\n"
+      "ldr d29, [x27], #0x8\n"
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
       "ldr d24, [x22], #0x8\n"
       "ldr d23, [x21], #0x8\n"
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v31.s }[2], [x28], #0x4\n"
-      "ld1 { v30.s }[2], [x27], #0x4\n"
+      "ld1 { v30.s }[2], [x28], #0x4\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.s }[2], [x26], #0x4\n"
-      "ld1 { v28.s }[2], [x25], #0x4\n"
-      "ld1 { v27.s }[2], [x24], #0x4\n"
-      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
       "ld1 { v24.s }[2], [x22], #0x4\n"
       "ld1 { v23.s }[2], [x21], #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.h }[6], [x28]\n"
-      "ld1 { v30.h }[6], [x27]\n"
+      "ld1 { v30.h }[6], [x28]\n"
+      "ld1 { v29.h }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.h }[6], [x26]\n"
-      "ld1 { v28.h }[6], [x25]\n"
-      "ld1 { v27.h }[6], [x24]\n"
-      "ld1 { v26.h }[6], [x23]\n"
+      "ld1 { v28.h }[6], [x26]\n"
+      "ld1 { v27.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x24]\n"
+      "ld1 { v25.h }[6], [x23]\n"
       "ld1 { v24.h }[6], [x22]\n"
       "ld1 { v23.h }[6], [x21]\n"
       "b 9f\n"
       "6:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.h }[4], [x28]\n"
-      "ld1 { v30.h }[4], [x27]\n"
+      "ld1 { v30.h }[4], [x28]\n"
+      "ld1 { v29.h }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.h }[4], [x26]\n"
-      "ld1 { v28.h }[4], [x25]\n"
-      "ld1 { v27.h }[4], [x24]\n"
-      "ld1 { v26.h }[4], [x23]\n"
+      "ld1 { v28.h }[4], [x26]\n"
+      "ld1 { v27.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x24]\n"
+      "ld1 { v25.h }[4], [x23]\n"
       "ld1 { v24.h }[4], [x22]\n"
       "ld1 { v23.h }[4], [x21]\n"
       "b 9f\n"
       "7:"  // odd_loads_2_0
       "tbz %x[width], #1, 8f\n"
-      "ldr s31, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s27, [x24], #0x4\n"
-      "ldr s26, [x23], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
       "ldr s24, [x22], #0x4\n"
       "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.h }[2], [x28]\n"
-      "ld1 { v30.h }[2], [x27]\n"
+      "ld1 { v30.h }[2], [x28]\n"
+      "ld1 { v29.h }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.h }[2], [x26]\n"
-      "ld1 { v28.h }[2], [x25]\n"
-      "ld1 { v27.h }[2], [x24]\n"
-      "ld1 { v26.h }[2], [x23]\n"
+      "ld1 { v28.h }[2], [x26]\n"
+      "ld1 { v27.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
       "ld1 { v24.h }[2], [x22]\n"
       "ld1 { v23.h }[2], [x21]\n"
       "b 9f\n"
       "8:"  // odd_loads_1_0
-      "ldr h31, [x28, #0x0]\n"
-      "ldr h30, [x27, #0x0]\n"
+      "ldr h30, [x28, #0x0]\n"
+      "ldr h29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h29, [x26, #0x0]\n"
-      "ldr h28, [x25, #0x0]\n"
-      "ldr h27, [x24, #0x0]\n"
-      "ldr h26, [x23, #0x0]\n"
+      "ldr h28, [x26, #0x0]\n"
+      "ldr h27, [x25, #0x0]\n"
+      "ldr h26, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
       "ldr h24, [x22, #0x0]\n"
       "ldr h23, [x21, #0x0]\n"
       "9:"  // Odd load end
-      "zip1 v25.8h, v31.8h, v27.8h\n"
-      "zip1 v18.8h, v29.8h, v24.8h\n"
-      "subs x20, x20, #0x1\n"
       "zip1 v22.8h, v30.8h, v26.8h\n"
-      "zip1 v21.8h, v28.8h, v23.8h\n"
-      "zip1 v17.8h, v25.8h, v18.8h\n"
-      "zip1 v16.8h, v22.8h, v21.8h\n"
-      "zip1 v20.8h, v17.8h, v16.8h\n"
-      "str q20, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v20.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v19.8h, v17.8h, v16.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
       "subs x20, x20, #0x1\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v19.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v18.8h, v25.8h, v18.8h\n"
-      "zip2 v17.8h, v22.8h, v21.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip1 v16.8h, v18.8h, v17.8h\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v16.8h, v18.8h, v17.8h\n"
-      "subs x20, x20, #0x1\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v22.8h, v31.8h, v27.8h\n"
-      "zip2 v21.8h, v29.8h, v24.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v26.8h\n"
-      "zip2 v19.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
       "zip1 v18.8h, v22.8h, v21.8h\n"
       "zip1 v17.8h, v20.8h, v19.8h\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
@@ -286,10 +254,42 @@
       "beq 10f\n"
       "zip2 v18.8h, v22.8h, v21.8h\n"
       "zip2 v17.8h, v20.8h, v19.8h\n"
+      "subs x20, x20, #0x1\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add v2.8h, v2.8h, v16.8h\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "10:"  // Odds skip
       "saddw v1.4s, v1.4s, v2.4h\n"
       "saddw2 v0.4s, v0.4s, v2.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
index 4a38187..c41120c 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -80,35 +80,35 @@
       "blt 3f\n"
       "2:"  // Main loop head
       "ldr d25, [x28], #0x8\n"
-      "ldr d30, [x27], #0x8\n"
+      "ldr d27, [x27], #0x8\n"
       "sshll v25.8h, v25.8b, #0x0\n"
-      "sshll v30.8h, v30.8b, #0x0\n"
-      "ldr d29, [x26], #0x8\n"
-      "ldr d28, [x25], #0x8\n"
-      "sshll v29.8h, v29.8b, #0x0\n"
-      "sshll v28.8h, v28.8b, #0x0\n"
-      "ldr d21, [x24], #0x8\n"
-      "ldr d27, [x23], #0x8\n"
-      "sshll v21.8h, v21.8b, #0x0\n"
       "sshll v27.8h, v27.8b, #0x0\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d26, [x21], #0x8\n"
-      "sshll v20.8h, v20.8b, #0x0\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d24, [x25], #0x8\n"
       "sshll v26.8h, v26.8b, #0x0\n"
+      "sshll v24.8h, v24.8b, #0x0\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d20, [x23], #0x8\n"
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "sshll v20.8h, v20.8b, #0x0\n"
+      "ldr d17, [x22], #0x8\n"
+      "ldr d16, [x21], #0x8\n"
+      "sshll v17.8h, v17.8b, #0x0\n"
+      "sshll v16.8h, v16.8b, #0x0\n"
       "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v22.8h, v29.8h, v20.8h\n"
+      "zip1 v22.8h, v26.8h, v17.8h\n"
       "subs %x[width], %x[width], #0x8\n"
       "cmp %x[width], #0x8\n"
-      "zip1 v19.8h, v30.8h, v27.8h\n"
-      "zip1 v18.8h, v28.8h, v26.8h\n"
+      "zip1 v19.8h, v27.8h, v20.8h\n"
+      "zip1 v18.8h, v24.8h, v16.8h\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
       "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
+      "zip2 v21.8h, v26.8h, v17.8h\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v26.8h\n"
+      "zip2 v20.8h, v27.8h, v20.8h\n"
+      "zip2 v16.8h, v24.8h, v16.8h\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
       "zip1 v24.8h, v23.8h, v22.8h\n"
@@ -142,140 +142,139 @@
       "3:"  // Main loop skip
       "cbz %x[width], 8f\n"
       "tbz %x[width], #2, 5f\n"
-      "ldr s25, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s21, [x24], #0x4\n"
-      "ldr s27, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s26, [x21], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #1, 4f\n"
-      "ld1 { v25.h }[2], [x28], #0x2\n"
-      "ld1 { v30.h }[2], [x27], #0x2\n"
+      "ld1 { v30.h }[2], [x28], #0x2\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.h }[2], [x26], #0x2\n"
-      "ld1 { v28.h }[2], [x25], #0x2\n"
-      "ld1 { v21.h }[2], [x24], #0x2\n"
-      "ld1 { v27.h }[2], [x23], #0x2\n"
-      "ld1 { v20.h }[2], [x22], #0x2\n"
-      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v26.h }[2], [x24], #0x2\n"
+      "ld1 { v25.h }[2], [x23], #0x2\n"
+      "ld1 { v24.h }[2], [x22], #0x2\n"
+      "ld1 { v23.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.b }[6], [x28]\n"
-      "ld1 { v30.b }[6], [x27]\n"
+      "ld1 { v30.b }[6], [x28]\n"
+      "ld1 { v29.b }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.b }[6], [x26]\n"
-      "ld1 { v28.b }[6], [x25]\n"
-      "ld1 { v21.b }[6], [x24]\n"
-      "ld1 { v27.b }[6], [x23]\n"
-      "ld1 { v20.b }[6], [x22]\n"
-      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v26.b }[6], [x24]\n"
+      "ld1 { v25.b }[6], [x23]\n"
+      "ld1 { v24.b }[6], [x22]\n"
+      "ld1 { v23.b }[6], [x21]\n"
       "b 7f\n"
       "4:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.b }[4], [x28]\n"
-      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v30.b }[4], [x28]\n"
+      "ld1 { v29.b }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.b }[4], [x26]\n"
-      "ld1 { v28.b }[4], [x25]\n"
-      "ld1 { v21.b }[4], [x24]\n"
-      "ld1 { v27.b }[4], [x23]\n"
-      "ld1 { v20.b }[4], [x22]\n"
-      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v26.b }[4], [x24]\n"
+      "ld1 { v25.b }[4], [x23]\n"
+      "ld1 { v24.b }[4], [x22]\n"
+      "ld1 { v23.b }[4], [x21]\n"
       "b 7f\n"
       "5:"  // odd_loads_2_0
       "tbz %x[width], #1, 6f\n"
-      "ldr h25, [x28], #0x2\n"
-      "ldr h30, [x27], #0x2\n"
+      "ldr h30, [x28], #0x2\n"
+      "ldr h29, [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ldr h29, [x26], #0x2\n"
-      "ldr h28, [x25], #0x2\n"
-      "ldr h21, [x24], #0x2\n"
-      "ldr h27, [x23], #0x2\n"
-      "ldr h20, [x22], #0x2\n"
-      "ldr h26, [x21], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h26, [x24], #0x2\n"
+      "ldr h25, [x23], #0x2\n"
+      "ldr h24, [x22], #0x2\n"
+      "ldr h23, [x21], #0x2\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.b }[2], [x28]\n"
-      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v30.b }[2], [x28]\n"
+      "ld1 { v29.b }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.b }[2], [x26]\n"
-      "ld1 { v28.b }[2], [x25]\n"
-      "ld1 { v21.b }[2], [x24]\n"
-      "ld1 { v27.b }[2], [x23]\n"
-      "ld1 { v20.b }[2], [x22]\n"
-      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v26.b }[2], [x24]\n"
+      "ld1 { v25.b }[2], [x23]\n"
+      "ld1 { v24.b }[2], [x22]\n"
+      "ld1 { v23.b }[2], [x21]\n"
       "b 7f\n"
       "6:"  // odd_loads_1_0
-      "ldr b25, [x28, #0x0]\n"
-      "ldr b30, [x27, #0x0]\n"
+      "ldr b30, [x28, #0x0]\n"
+      "ldr b29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b29, [x26, #0x0]\n"
-      "ldr b28, [x25, #0x0]\n"
-      "ldr b21, [x24, #0x0]\n"
-      "ldr b27, [x23, #0x0]\n"
-      "ldr b20, [x22, #0x0]\n"
-      "ldr b26, [x21, #0x0]\n"
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b26, [x24, #0x0]\n"
+      "ldr b25, [x23, #0x0]\n"
+      "ldr b24, [x22, #0x0]\n"
+      "ldr b23, [x21, #0x0]\n"
       "7:"  // Odd load end
-      "sshll v25.8h, v25.8b, #0x0\n"
       "sshll v30.8h, v30.8b, #0x0\n"
-      "subs x20, x20, #0x1\n"
       "sshll v29.8h, v29.8b, #0x0\n"
+      "subs x20, x20, #0x1\n"
       "sshll v28.8h, v28.8b, #0x0\n"
-      "sshll v21.8h, v21.8b, #0x0\n"
       "sshll v27.8h, v27.8b, #0x0\n"
-      "sshll v20.8h, v20.8b, #0x0\n"
       "sshll v26.8h, v26.8b, #0x0\n"
-      "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v22.8h, v29.8h, v20.8h\n"
-      "zip1 v19.8h, v30.8h, v27.8h\n"
-      "zip1 v18.8h, v28.8h, v26.8h\n"
-      "zip1 v24.8h, v23.8h, v22.8h\n"
-      "zip1 v17.8h, v19.8h, v18.8h\n"
-      "zip1 v16.8h, v24.8h, v17.8h\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "sshll v24.8h, v24.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v22.8h, v30.8h, v26.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v24.8h, v17.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v23.8h, v23.8h, v22.8h\n"
-      "zip2 v19.8h, v19.8h, v18.8h\n"
+      "zip2 v18.8h, v22.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v17.8h, v23.8h, v19.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 8f\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v23.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v26.8h\n"
-      "zip1 v22.8h, v25.8h, v21.8h\n"
-      "zip1 v18.8h, v20.8h, v16.8h\n"
-      "zip1 v19.8h, v22.8h, v18.8h\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v18.8h, v22.8h, v18.8h\n"
-      "str q18, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v21.8h, v25.8h, v21.8h\n"
-      "zip2 v20.8h, v20.8h, v16.8h\n"
-      "zip1 v17.8h, v21.8h, v20.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "8:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
index 3ad103c..9ac7053 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -167,126 +167,94 @@
       "5:"  // Main loop skip
       "cbz %x[width], 10f\n"
       "tbz %x[width], #2, 7f\n"
-      "ldr s31, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s27, [x24], #0x4\n"
-      "ldr s26, [x23], #0x4\n"
-      "ldr s25, [x22], #0x4\n"
-      "ldr s24, [x21], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v31.h }[2], [x28], #0x2\n"
-      "ld1 { v30.h }[2], [x27], #0x2\n"
+      "ld1 { v30.h }[2], [x28], #0x2\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.h }[2], [x26], #0x2\n"
-      "ld1 { v28.h }[2], [x25], #0x2\n"
-      "ld1 { v27.h }[2], [x24], #0x2\n"
-      "ld1 { v26.h }[2], [x23], #0x2\n"
-      "ld1 { v25.h }[2], [x22], #0x2\n"
-      "ld1 { v24.h }[2], [x21], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v26.h }[2], [x24], #0x2\n"
+      "ld1 { v25.h }[2], [x23], #0x2\n"
+      "ld1 { v24.h }[2], [x22], #0x2\n"
+      "ld1 { v23.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.b }[6], [x28]\n"
-      "ld1 { v30.b }[6], [x27]\n"
+      "ld1 { v30.b }[6], [x28]\n"
+      "ld1 { v29.b }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.b }[6], [x26]\n"
-      "ld1 { v28.b }[6], [x25]\n"
-      "ld1 { v27.b }[6], [x24]\n"
-      "ld1 { v26.b }[6], [x23]\n"
-      "ld1 { v25.b }[6], [x22]\n"
-      "ld1 { v24.b }[6], [x21]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v26.b }[6], [x24]\n"
+      "ld1 { v25.b }[6], [x23]\n"
+      "ld1 { v24.b }[6], [x22]\n"
+      "ld1 { v23.b }[6], [x21]\n"
       "b 9f\n"
       "6:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.b }[4], [x28]\n"
-      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v30.b }[4], [x28]\n"
+      "ld1 { v29.b }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.b }[4], [x26]\n"
-      "ld1 { v28.b }[4], [x25]\n"
-      "ld1 { v27.b }[4], [x24]\n"
-      "ld1 { v26.b }[4], [x23]\n"
-      "ld1 { v25.b }[4], [x22]\n"
-      "ld1 { v24.b }[4], [x21]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v26.b }[4], [x24]\n"
+      "ld1 { v25.b }[4], [x23]\n"
+      "ld1 { v24.b }[4], [x22]\n"
+      "ld1 { v23.b }[4], [x21]\n"
       "b 9f\n"
       "7:"  // odd_loads_2_0
       "tbz %x[width], #1, 8f\n"
-      "ldr h31, [x28], #0x2\n"
-      "ldr h30, [x27], #0x2\n"
+      "ldr h30, [x28], #0x2\n"
+      "ldr h29, [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ldr h29, [x26], #0x2\n"
-      "ldr h28, [x25], #0x2\n"
-      "ldr h27, [x24], #0x2\n"
-      "ldr h26, [x23], #0x2\n"
-      "ldr h25, [x22], #0x2\n"
-      "ldr h24, [x21], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h26, [x24], #0x2\n"
+      "ldr h25, [x23], #0x2\n"
+      "ldr h24, [x22], #0x2\n"
+      "ldr h23, [x21], #0x2\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.b }[2], [x28]\n"
-      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v30.b }[2], [x28]\n"
+      "ld1 { v29.b }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.b }[2], [x26]\n"
-      "ld1 { v28.b }[2], [x25]\n"
-      "ld1 { v27.b }[2], [x24]\n"
-      "ld1 { v26.b }[2], [x23]\n"
-      "ld1 { v25.b }[2], [x22]\n"
-      "ld1 { v24.b }[2], [x21]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v26.b }[2], [x24]\n"
+      "ld1 { v25.b }[2], [x23]\n"
+      "ld1 { v24.b }[2], [x22]\n"
+      "ld1 { v23.b }[2], [x21]\n"
       "b 9f\n"
       "8:"  // odd_loads_1_0
-      "ldr b31, [x28, #0x0]\n"
-      "ldr b30, [x27, #0x0]\n"
+      "ldr b30, [x28, #0x0]\n"
+      "ldr b29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b29, [x26, #0x0]\n"
-      "ldr b28, [x25, #0x0]\n"
-      "ldr b27, [x24, #0x0]\n"
-      "ldr b26, [x23, #0x0]\n"
-      "ldr b25, [x22, #0x0]\n"
-      "ldr b24, [x21, #0x0]\n"
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b26, [x24, #0x0]\n"
+      "ldr b25, [x23, #0x0]\n"
+      "ldr b24, [x22, #0x0]\n"
+      "ldr b23, [x21, #0x0]\n"
       "9:"  // Odd load end
-      "sshll v31.8h, v31.8b, #0x0\n"
       "sshll v30.8h, v30.8b, #0x0\n"
-      "subs x20, x20, #0x1\n"
       "sshll v29.8h, v29.8b, #0x0\n"
+      "subs x20, x20, #0x1\n"
       "sshll v28.8h, v28.8b, #0x0\n"
       "sshll v27.8h, v27.8b, #0x0\n"
       "sshll v26.8h, v26.8b, #0x0\n"
       "sshll v25.8h, v25.8b, #0x0\n"
       "sshll v24.8h, v24.8b, #0x0\n"
-      "zip1 v23.8h, v31.8h, v27.8h\n"
-      "zip1 v22.8h, v29.8h, v25.8h\n"
-      "zip1 v21.8h, v30.8h, v26.8h\n"
-      "zip1 v20.8h, v28.8h, v24.8h\n"
-      "zip1 v18.8h, v23.8h, v22.8h\n"
-      "zip1 v17.8h, v21.8h, v20.8h\n"
-      "zip1 v16.8h, v18.8h, v17.8h\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v19.8h, v18.8h, v17.8h\n"
-      "subs x20, x20, #0x1\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v19.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v18.8h, v23.8h, v22.8h\n"
-      "zip2 v17.8h, v21.8h, v20.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip1 v16.8h, v18.8h, v17.8h\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v16.8h, v18.8h, v17.8h\n"
-      "subs x20, x20, #0x1\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v22.8h, v31.8h, v27.8h\n"
-      "zip2 v21.8h, v29.8h, v25.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v26.8h\n"
-      "zip2 v19.8h, v28.8h, v24.8h\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v22.8h, v30.8h, v26.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
       "zip1 v18.8h, v22.8h, v21.8h\n"
       "zip1 v17.8h, v20.8h, v19.8h\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
@@ -302,10 +270,42 @@
       "beq 10f\n"
       "zip2 v18.8h, v22.8h, v21.8h\n"
       "zip2 v17.8h, v20.8h, v19.8h\n"
+      "subs x20, x20, #0x1\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add v2.8h, v2.8h, v16.8h\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "10:"  // Odds skip
       "saddw v1.4s, v1.4s, v2.4h\n"
       "saddw2 v0.4s, v0.4s, v2.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
index de29d77..c01d980 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -159,118 +159,86 @@
       "5:"  // Main loop skip
       "cbz %x[width], 10f\n"
       "tbz %x[width], #2, 7f\n"
-      "ldr d31, [x28], #0x8\n"
-      "ldr d30, [x27], #0x8\n"
-      "ldr d29, [x26], #0x8\n"
-      "ldr d28, [x25], #0x8\n"
-      "ldr d27, [x24], #0x8\n"
-      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x28], #0x8\n"
+      "ldr d29, [x27], #0x8\n"
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
       "ldr d24, [x22], #0x8\n"
       "ldr d23, [x21], #0x8\n"
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v31.s }[2], [x28], #0x4\n"
-      "ld1 { v30.s }[2], [x27], #0x4\n"
+      "ld1 { v30.s }[2], [x28], #0x4\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.s }[2], [x26], #0x4\n"
-      "ld1 { v28.s }[2], [x25], #0x4\n"
-      "ld1 { v27.s }[2], [x24], #0x4\n"
-      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
       "ld1 { v24.s }[2], [x22], #0x4\n"
       "ld1 { v23.s }[2], [x21], #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.h }[6], [x28]\n"
-      "ld1 { v30.h }[6], [x27]\n"
+      "ld1 { v30.h }[6], [x28]\n"
+      "ld1 { v29.h }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.h }[6], [x26]\n"
-      "ld1 { v28.h }[6], [x25]\n"
-      "ld1 { v27.h }[6], [x24]\n"
-      "ld1 { v26.h }[6], [x23]\n"
+      "ld1 { v28.h }[6], [x26]\n"
+      "ld1 { v27.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x24]\n"
+      "ld1 { v25.h }[6], [x23]\n"
       "ld1 { v24.h }[6], [x22]\n"
       "ld1 { v23.h }[6], [x21]\n"
       "b 9f\n"
       "6:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.h }[4], [x28]\n"
-      "ld1 { v30.h }[4], [x27]\n"
+      "ld1 { v30.h }[4], [x28]\n"
+      "ld1 { v29.h }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.h }[4], [x26]\n"
-      "ld1 { v28.h }[4], [x25]\n"
-      "ld1 { v27.h }[4], [x24]\n"
-      "ld1 { v26.h }[4], [x23]\n"
+      "ld1 { v28.h }[4], [x26]\n"
+      "ld1 { v27.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x24]\n"
+      "ld1 { v25.h }[4], [x23]\n"
       "ld1 { v24.h }[4], [x22]\n"
       "ld1 { v23.h }[4], [x21]\n"
       "b 9f\n"
       "7:"  // odd_loads_2_0
       "tbz %x[width], #1, 8f\n"
-      "ldr s31, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s27, [x24], #0x4\n"
-      "ldr s26, [x23], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
       "ldr s24, [x22], #0x4\n"
       "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.h }[2], [x28]\n"
-      "ld1 { v30.h }[2], [x27]\n"
+      "ld1 { v30.h }[2], [x28]\n"
+      "ld1 { v29.h }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.h }[2], [x26]\n"
-      "ld1 { v28.h }[2], [x25]\n"
-      "ld1 { v27.h }[2], [x24]\n"
-      "ld1 { v26.h }[2], [x23]\n"
+      "ld1 { v28.h }[2], [x26]\n"
+      "ld1 { v27.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
       "ld1 { v24.h }[2], [x22]\n"
       "ld1 { v23.h }[2], [x21]\n"
       "b 9f\n"
       "8:"  // odd_loads_1_0
-      "ldr h31, [x28, #0x0]\n"
-      "ldr h30, [x27, #0x0]\n"
+      "ldr h30, [x28, #0x0]\n"
+      "ldr h29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h29, [x26, #0x0]\n"
-      "ldr h28, [x25, #0x0]\n"
-      "ldr h27, [x24, #0x0]\n"
-      "ldr h26, [x23, #0x0]\n"
+      "ldr h28, [x26, #0x0]\n"
+      "ldr h27, [x25, #0x0]\n"
+      "ldr h26, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
       "ldr h24, [x22, #0x0]\n"
       "ldr h23, [x21, #0x0]\n"
       "9:"  // Odd load end
-      "zip1 v25.8h, v31.8h, v27.8h\n"
-      "zip1 v18.8h, v29.8h, v24.8h\n"
-      "subs x20, x20, #0x1\n"
       "zip1 v22.8h, v30.8h, v26.8h\n"
-      "zip1 v21.8h, v28.8h, v23.8h\n"
-      "zip1 v17.8h, v25.8h, v18.8h\n"
-      "zip1 v16.8h, v22.8h, v21.8h\n"
-      "zip1 v20.8h, v17.8h, v16.8h\n"
-      "str q20, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v20.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v19.8h, v17.8h, v16.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
       "subs x20, x20, #0x1\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v19.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v18.8h, v25.8h, v18.8h\n"
-      "zip2 v17.8h, v22.8h, v21.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip1 v16.8h, v18.8h, v17.8h\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v16.8h, v18.8h, v17.8h\n"
-      "subs x20, x20, #0x1\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v22.8h, v31.8h, v27.8h\n"
-      "zip2 v21.8h, v29.8h, v24.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v26.8h\n"
-      "zip2 v19.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
       "zip1 v18.8h, v22.8h, v21.8h\n"
       "zip1 v17.8h, v20.8h, v19.8h\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
@@ -286,10 +254,42 @@
       "beq 10f\n"
       "zip2 v18.8h, v22.8h, v21.8h\n"
       "zip2 v17.8h, v20.8h, v19.8h\n"
+      "subs x20, x20, #0x1\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add v2.8h, v2.8h, v16.8h\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "10:"  // Odds skip
       "uaddw v1.4s, v1.4s, v2.4h\n"
       "uaddw2 v0.4s, v0.4s, v2.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
index 43a3a46..d29a995 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -80,35 +80,35 @@
       "blt 3f\n"
       "2:"  // Main loop head
       "ldr d25, [x28], #0x8\n"
-      "ldr d30, [x27], #0x8\n"
+      "ldr d27, [x27], #0x8\n"
       "ushll v25.8h, v25.8b, #0x0\n"
-      "ushll v30.8h, v30.8b, #0x0\n"
-      "ldr d29, [x26], #0x8\n"
-      "ldr d28, [x25], #0x8\n"
-      "ushll v29.8h, v29.8b, #0x0\n"
-      "ushll v28.8h, v28.8b, #0x0\n"
-      "ldr d21, [x24], #0x8\n"
-      "ldr d27, [x23], #0x8\n"
-      "ushll v21.8h, v21.8b, #0x0\n"
       "ushll v27.8h, v27.8b, #0x0\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d26, [x21], #0x8\n"
-      "ushll v20.8h, v20.8b, #0x0\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d24, [x25], #0x8\n"
       "ushll v26.8h, v26.8b, #0x0\n"
+      "ushll v24.8h, v24.8b, #0x0\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d20, [x23], #0x8\n"
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "ushll v20.8h, v20.8b, #0x0\n"
+      "ldr d17, [x22], #0x8\n"
+      "ldr d16, [x21], #0x8\n"
+      "ushll v17.8h, v17.8b, #0x0\n"
+      "ushll v16.8h, v16.8b, #0x0\n"
       "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v22.8h, v29.8h, v20.8h\n"
+      "zip1 v22.8h, v26.8h, v17.8h\n"
       "subs %x[width], %x[width], #0x8\n"
       "cmp %x[width], #0x8\n"
-      "zip1 v19.8h, v30.8h, v27.8h\n"
-      "zip1 v18.8h, v28.8h, v26.8h\n"
+      "zip1 v19.8h, v27.8h, v20.8h\n"
+      "zip1 v18.8h, v24.8h, v16.8h\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
       "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
+      "zip2 v21.8h, v26.8h, v17.8h\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v26.8h\n"
+      "zip2 v20.8h, v27.8h, v20.8h\n"
+      "zip2 v16.8h, v24.8h, v16.8h\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
       "zip1 v24.8h, v23.8h, v22.8h\n"
@@ -142,140 +142,139 @@
       "3:"  // Main loop skip
       "cbz %x[width], 8f\n"
       "tbz %x[width], #2, 5f\n"
-      "ldr s25, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s21, [x24], #0x4\n"
-      "ldr s27, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s26, [x21], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #1, 4f\n"
-      "ld1 { v25.h }[2], [x28], #0x2\n"
-      "ld1 { v30.h }[2], [x27], #0x2\n"
+      "ld1 { v30.h }[2], [x28], #0x2\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.h }[2], [x26], #0x2\n"
-      "ld1 { v28.h }[2], [x25], #0x2\n"
-      "ld1 { v21.h }[2], [x24], #0x2\n"
-      "ld1 { v27.h }[2], [x23], #0x2\n"
-      "ld1 { v20.h }[2], [x22], #0x2\n"
-      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v26.h }[2], [x24], #0x2\n"
+      "ld1 { v25.h }[2], [x23], #0x2\n"
+      "ld1 { v24.h }[2], [x22], #0x2\n"
+      "ld1 { v23.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.b }[6], [x28]\n"
-      "ld1 { v30.b }[6], [x27]\n"
+      "ld1 { v30.b }[6], [x28]\n"
+      "ld1 { v29.b }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.b }[6], [x26]\n"
-      "ld1 { v28.b }[6], [x25]\n"
-      "ld1 { v21.b }[6], [x24]\n"
-      "ld1 { v27.b }[6], [x23]\n"
-      "ld1 { v20.b }[6], [x22]\n"
-      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v26.b }[6], [x24]\n"
+      "ld1 { v25.b }[6], [x23]\n"
+      "ld1 { v24.b }[6], [x22]\n"
+      "ld1 { v23.b }[6], [x21]\n"
       "b 7f\n"
       "4:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.b }[4], [x28]\n"
-      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v30.b }[4], [x28]\n"
+      "ld1 { v29.b }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.b }[4], [x26]\n"
-      "ld1 { v28.b }[4], [x25]\n"
-      "ld1 { v21.b }[4], [x24]\n"
-      "ld1 { v27.b }[4], [x23]\n"
-      "ld1 { v20.b }[4], [x22]\n"
-      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v26.b }[4], [x24]\n"
+      "ld1 { v25.b }[4], [x23]\n"
+      "ld1 { v24.b }[4], [x22]\n"
+      "ld1 { v23.b }[4], [x21]\n"
       "b 7f\n"
       "5:"  // odd_loads_2_0
       "tbz %x[width], #1, 6f\n"
-      "ldr h25, [x28], #0x2\n"
-      "ldr h30, [x27], #0x2\n"
+      "ldr h30, [x28], #0x2\n"
+      "ldr h29, [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ldr h29, [x26], #0x2\n"
-      "ldr h28, [x25], #0x2\n"
-      "ldr h21, [x24], #0x2\n"
-      "ldr h27, [x23], #0x2\n"
-      "ldr h20, [x22], #0x2\n"
-      "ldr h26, [x21], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h26, [x24], #0x2\n"
+      "ldr h25, [x23], #0x2\n"
+      "ldr h24, [x22], #0x2\n"
+      "ldr h23, [x21], #0x2\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v25.b }[2], [x28]\n"
-      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v30.b }[2], [x28]\n"
+      "ld1 { v29.b }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.b }[2], [x26]\n"
-      "ld1 { v28.b }[2], [x25]\n"
-      "ld1 { v21.b }[2], [x24]\n"
-      "ld1 { v27.b }[2], [x23]\n"
-      "ld1 { v20.b }[2], [x22]\n"
-      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v26.b }[2], [x24]\n"
+      "ld1 { v25.b }[2], [x23]\n"
+      "ld1 { v24.b }[2], [x22]\n"
+      "ld1 { v23.b }[2], [x21]\n"
       "b 7f\n"
       "6:"  // odd_loads_1_0
-      "ldr b25, [x28, #0x0]\n"
-      "ldr b30, [x27, #0x0]\n"
+      "ldr b30, [x28, #0x0]\n"
+      "ldr b29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b29, [x26, #0x0]\n"
-      "ldr b28, [x25, #0x0]\n"
-      "ldr b21, [x24, #0x0]\n"
-      "ldr b27, [x23, #0x0]\n"
-      "ldr b20, [x22, #0x0]\n"
-      "ldr b26, [x21, #0x0]\n"
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b26, [x24, #0x0]\n"
+      "ldr b25, [x23, #0x0]\n"
+      "ldr b24, [x22, #0x0]\n"
+      "ldr b23, [x21, #0x0]\n"
       "7:"  // Odd load end
-      "ushll v25.8h, v25.8b, #0x0\n"
       "ushll v30.8h, v30.8b, #0x0\n"
-      "subs x20, x20, #0x1\n"
       "ushll v29.8h, v29.8b, #0x0\n"
+      "subs x20, x20, #0x1\n"
       "ushll v28.8h, v28.8b, #0x0\n"
-      "ushll v21.8h, v21.8b, #0x0\n"
       "ushll v27.8h, v27.8b, #0x0\n"
-      "ushll v20.8h, v20.8b, #0x0\n"
       "ushll v26.8h, v26.8b, #0x0\n"
-      "zip1 v23.8h, v25.8h, v21.8h\n"
-      "zip1 v22.8h, v29.8h, v20.8h\n"
-      "zip1 v19.8h, v30.8h, v27.8h\n"
-      "zip1 v18.8h, v28.8h, v26.8h\n"
-      "zip1 v24.8h, v23.8h, v22.8h\n"
-      "zip1 v17.8h, v19.8h, v18.8h\n"
-      "zip1 v16.8h, v24.8h, v17.8h\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "ushll v24.8h, v24.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v22.8h, v30.8h, v26.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v24.8h, v17.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v23.8h, v23.8h, v22.8h\n"
-      "zip2 v19.8h, v19.8h, v18.8h\n"
+      "zip2 v18.8h, v22.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v17.8h, v23.8h, v19.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 8f\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v16.8h, v23.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v25.8h, v25.8h, v21.8h\n"
-      "zip2 v21.8h, v29.8h, v20.8h\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v27.8h\n"
-      "zip2 v16.8h, v28.8h, v26.8h\n"
-      "zip1 v22.8h, v25.8h, v21.8h\n"
-      "zip1 v18.8h, v20.8h, v16.8h\n"
-      "zip1 v19.8h, v22.8h, v18.8h\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v18.8h, v22.8h, v18.8h\n"
-      "str q18, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "beq 8f\n"
-      "zip2 v21.8h, v25.8h, v21.8h\n"
-      "zip2 v20.8h, v20.8h, v16.8h\n"
-      "zip1 v17.8h, v21.8h, v20.8h\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "8:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
index 3ab2436..ae4bf9b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -167,126 +167,94 @@
       "5:"  // Main loop skip
       "cbz %x[width], 10f\n"
       "tbz %x[width], #2, 7f\n"
-      "ldr s31, [x28], #0x4\n"
-      "ldr s30, [x27], #0x4\n"
-      "ldr s29, [x26], #0x4\n"
-      "ldr s28, [x25], #0x4\n"
-      "ldr s27, [x24], #0x4\n"
-      "ldr s26, [x23], #0x4\n"
-      "ldr s25, [x22], #0x4\n"
-      "ldr s24, [x21], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v31.h }[2], [x28], #0x2\n"
-      "ld1 { v30.h }[2], [x27], #0x2\n"
+      "ld1 { v30.h }[2], [x28], #0x2\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
       "mov x20, #0x6\n"
-      "ld1 { v29.h }[2], [x26], #0x2\n"
-      "ld1 { v28.h }[2], [x25], #0x2\n"
-      "ld1 { v27.h }[2], [x24], #0x2\n"
-      "ld1 { v26.h }[2], [x23], #0x2\n"
-      "ld1 { v25.h }[2], [x22], #0x2\n"
-      "ld1 { v24.h }[2], [x21], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v26.h }[2], [x24], #0x2\n"
+      "ld1 { v25.h }[2], [x23], #0x2\n"
+      "ld1 { v24.h }[2], [x22], #0x2\n"
+      "ld1 { v23.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.b }[6], [x28]\n"
-      "ld1 { v30.b }[6], [x27]\n"
+      "ld1 { v30.b }[6], [x28]\n"
+      "ld1 { v29.b }[6], [x27]\n"
       "mov x20, #0x7\n"
-      "ld1 { v29.b }[6], [x26]\n"
-      "ld1 { v28.b }[6], [x25]\n"
-      "ld1 { v27.b }[6], [x24]\n"
-      "ld1 { v26.b }[6], [x23]\n"
-      "ld1 { v25.b }[6], [x22]\n"
-      "ld1 { v24.b }[6], [x21]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v26.b }[6], [x24]\n"
+      "ld1 { v25.b }[6], [x23]\n"
+      "ld1 { v24.b }[6], [x22]\n"
+      "ld1 { v23.b }[6], [x21]\n"
       "b 9f\n"
       "6:"  // odd_loads_1_4
       "mov x20, #0x4\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.b }[4], [x28]\n"
-      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v30.b }[4], [x28]\n"
+      "ld1 { v29.b }[4], [x27]\n"
       "mov x20, #0x5\n"
-      "ld1 { v29.b }[4], [x26]\n"
-      "ld1 { v28.b }[4], [x25]\n"
-      "ld1 { v27.b }[4], [x24]\n"
-      "ld1 { v26.b }[4], [x23]\n"
-      "ld1 { v25.b }[4], [x22]\n"
-      "ld1 { v24.b }[4], [x21]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v26.b }[4], [x24]\n"
+      "ld1 { v25.b }[4], [x23]\n"
+      "ld1 { v24.b }[4], [x22]\n"
+      "ld1 { v23.b }[4], [x21]\n"
       "b 9f\n"
       "7:"  // odd_loads_2_0
       "tbz %x[width], #1, 8f\n"
-      "ldr h31, [x28], #0x2\n"
-      "ldr h30, [x27], #0x2\n"
+      "ldr h30, [x28], #0x2\n"
+      "ldr h29, [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ldr h29, [x26], #0x2\n"
-      "ldr h28, [x25], #0x2\n"
-      "ldr h27, [x24], #0x2\n"
-      "ldr h26, [x23], #0x2\n"
-      "ldr h25, [x22], #0x2\n"
-      "ldr h24, [x21], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h26, [x24], #0x2\n"
+      "ldr h25, [x23], #0x2\n"
+      "ldr h24, [x22], #0x2\n"
+      "ldr h23, [x21], #0x2\n"
       "tbz %x[width], #0, 9f\n"
-      "ld1 { v31.b }[2], [x28]\n"
-      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v30.b }[2], [x28]\n"
+      "ld1 { v29.b }[2], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v29.b }[2], [x26]\n"
-      "ld1 { v28.b }[2], [x25]\n"
-      "ld1 { v27.b }[2], [x24]\n"
-      "ld1 { v26.b }[2], [x23]\n"
-      "ld1 { v25.b }[2], [x22]\n"
-      "ld1 { v24.b }[2], [x21]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v26.b }[2], [x24]\n"
+      "ld1 { v25.b }[2], [x23]\n"
+      "ld1 { v24.b }[2], [x22]\n"
+      "ld1 { v23.b }[2], [x21]\n"
       "b 9f\n"
       "8:"  // odd_loads_1_0
-      "ldr b31, [x28, #0x0]\n"
-      "ldr b30, [x27, #0x0]\n"
+      "ldr b30, [x28, #0x0]\n"
+      "ldr b29, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b29, [x26, #0x0]\n"
-      "ldr b28, [x25, #0x0]\n"
-      "ldr b27, [x24, #0x0]\n"
-      "ldr b26, [x23, #0x0]\n"
-      "ldr b25, [x22, #0x0]\n"
-      "ldr b24, [x21, #0x0]\n"
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b26, [x24, #0x0]\n"
+      "ldr b25, [x23, #0x0]\n"
+      "ldr b24, [x22, #0x0]\n"
+      "ldr b23, [x21, #0x0]\n"
       "9:"  // Odd load end
-      "ushll v31.8h, v31.8b, #0x0\n"
       "ushll v30.8h, v30.8b, #0x0\n"
-      "subs x20, x20, #0x1\n"
       "ushll v29.8h, v29.8b, #0x0\n"
+      "subs x20, x20, #0x1\n"
       "ushll v28.8h, v28.8b, #0x0\n"
       "ushll v27.8h, v27.8b, #0x0\n"
       "ushll v26.8h, v26.8b, #0x0\n"
       "ushll v25.8h, v25.8b, #0x0\n"
       "ushll v24.8h, v24.8b, #0x0\n"
-      "zip1 v23.8h, v31.8h, v27.8h\n"
-      "zip1 v22.8h, v29.8h, v25.8h\n"
-      "zip1 v21.8h, v30.8h, v26.8h\n"
-      "zip1 v20.8h, v28.8h, v24.8h\n"
-      "zip1 v18.8h, v23.8h, v22.8h\n"
-      "zip1 v17.8h, v21.8h, v20.8h\n"
-      "zip1 v16.8h, v18.8h, v17.8h\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v19.8h, v18.8h, v17.8h\n"
-      "subs x20, x20, #0x1\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v19.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v18.8h, v23.8h, v22.8h\n"
-      "zip2 v17.8h, v21.8h, v20.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip1 v16.8h, v18.8h, v17.8h\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v16.8h, v18.8h, v17.8h\n"
-      "subs x20, x20, #0x1\n"
-      "str q16, [%x[out_ptr], #0x0]\n"
-      "add v2.8h, v2.8h, v16.8h\n"
-      "add %x[out_ptr], %x[out_ptr], #0x10\n"
-      "beq 10f\n"
-      "zip2 v22.8h, v31.8h, v27.8h\n"
-      "zip2 v21.8h, v29.8h, v25.8h\n"
-      "subs x20, x20, #0x1\n"
-      "zip2 v20.8h, v30.8h, v26.8h\n"
-      "zip2 v19.8h, v28.8h, v24.8h\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v22.8h, v30.8h, v26.8h\n"
+      "zip1 v21.8h, v28.8h, v24.8h\n"
+      "zip1 v20.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v23.8h\n"
       "zip1 v18.8h, v22.8h, v21.8h\n"
       "zip1 v17.8h, v20.8h, v19.8h\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
@@ -302,10 +270,42 @@
       "beq 10f\n"
       "zip2 v18.8h, v22.8h, v21.8h\n"
       "zip2 v17.8h, v20.8h, v19.8h\n"
+      "subs x20, x20, #0x1\n"
       "zip1 v16.8h, v18.8h, v17.8h\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "add v2.8h, v2.8h, v16.8h\n"
       "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v22.8h, v30.8h, v26.8h\n"
+      "zip2 v21.8h, v28.8h, v24.8h\n"
+      "subs x20, x20, #0x1\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v23.8h\n"
+      "zip1 v18.8h, v22.8h, v21.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x20, x20, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v17.8h, v22.8h, v21.8h\n"
+      "zip2 v16.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v17.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v2.8h, v2.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
       "10:"  // Odds skip
       "uaddw v1.4s, v1.4s, v2.4h\n"
       "uaddw2 v0.4s, v0.4s, v2.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
index d4d1504..43d9d20 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -79,29 +79,29 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q28, [x28], #0x10\n"
-      "ldr q27, [x27], #0x10\n"
+      "ldr q20, [x28], #0x10\n"
+      "ldr q18, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x8\n"
       "cmp %x[width], #0x8\n"
-      "ldr q22, [x26], #0x10\n"
-      "ldr q21, [x25], #0x10\n"
-      "zip1 v26.4s, v28.4s, v22.4s\n"
-      "zip1 v25.4s, v27.4s, v21.4s\n"
-      "ldr q24, [x24], #0x10\n"
+      "ldr q17, [x26], #0x10\n"
+      "ldr q16, [x25], #0x10\n"
+      "zip1 v25.4s, v20.4s, v17.4s\n"
+      "zip1 v24.4s, v18.4s, v16.4s\n"
+      "ldr q19, [x24], #0x10\n"
       "ldr q23, [x23], #0x10\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
-      "ldr q19, [x22], #0x10\n"
-      "ldr q18, [x21], #0x10\n"
-      "zip1 v20.4s, v24.4s, v19.4s\n"
-      "zip1 v17.4s, v23.4s, v18.4s\n"
-      "zip2 v19.4s, v24.4s, v19.4s\n"
-      "zip2 v18.4s, v23.4s, v18.4s\n"
+      "zip2 v22.4s, v20.4s, v17.4s\n"
+      "zip2 v21.4s, v18.4s, v16.4s\n"
+      "ldr q18, [x22], #0x10\n"
+      "ldr q16, [x21], #0x10\n"
+      "zip1 v20.4s, v19.4s, v18.4s\n"
+      "zip1 v17.4s, v23.4s, v16.4s\n"
+      "zip2 v19.4s, v19.4s, v18.4s\n"
+      "zip2 v18.4s, v23.4s, v16.4s\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip1 v16.4s, v26.4s, v25.4s\n"
+      "zip1 v16.4s, v25.4s, v24.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
@@ -109,7 +109,7 @@
       "str q16, [%x[out_ptr], #0x10]\n"
       "prfm pldl1keep, [x22, #0x70]\n"
       "prfm pldl1keep, [x21, #0x70]\n"
-      "zip2 v16.4s, v26.4s, v25.4s\n"
+      "zip2 v16.4s, v25.4s, v24.4s\n"
       "str q16, [%x[out_ptr], #0x20]\n"
       "zip2 v16.4s, v20.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x30]\n"
@@ -128,32 +128,32 @@
       "tbz %x[width], #2, 5f\n"
       "ldr d28, [x28], #0x8\n"
       "ldr d27, [x27], #0x8\n"
-      "ldr d22, [x26], #0x8\n"
-      "ldr d21, [x25], #0x8\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
       "ldr d24, [x24], #0x8\n"
       "ldr d23, [x23], #0x8\n"
-      "ldr d19, [x22], #0x8\n"
-      "ldr d18, [x21], #0x8\n"
+      "ldr d22, [x22], #0x8\n"
+      "ldr d21, [x21], #0x8\n"
       "tbz %x[width], #1, 4f\n"
       "ld1 { v28.s }[2], [x28], #0x4\n"
       "ld1 { v27.s }[2], [x27], #0x4\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.s }[2], [x26], #0x4\n"
-      "ld1 { v21.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
       "ld1 { v24.s }[2], [x24], #0x4\n"
       "ld1 { v23.s }[2], [x23], #0x4\n"
-      "ld1 { v19.s }[2], [x22], #0x4\n"
-      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x22], #0x4\n"
+      "ld1 { v21.s }[2], [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
       "ld1 { v28.h }[6], [x28]\n"
       "ld1 { v27.h }[6], [x27]\n"
       "mov x20, #0x4\n"
-      "ld1 { v22.h }[6], [x26]\n"
-      "ld1 { v21.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x26]\n"
+      "ld1 { v25.h }[6], [x25]\n"
       "ld1 { v24.h }[6], [x24]\n"
       "ld1 { v23.h }[6], [x23]\n"
-      "ld1 { v19.h }[6], [x22]\n"
-      "ld1 { v18.h }[6], [x21]\n"
+      "ld1 { v22.h }[6], [x22]\n"
+      "ld1 { v21.h }[6], [x21]\n"
       "b 7f\n"
       "4:"  // odd_loads_1_4
       "mov x20, #0x2\n"
@@ -161,82 +161,81 @@
       "ld1 { v28.h }[4], [x28]\n"
       "ld1 { v27.h }[4], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.h }[4], [x26]\n"
-      "ld1 { v21.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x26]\n"
+      "ld1 { v25.h }[4], [x25]\n"
       "ld1 { v24.h }[4], [x24]\n"
       "ld1 { v23.h }[4], [x23]\n"
-      "ld1 { v19.h }[4], [x22]\n"
-      "ld1 { v18.h }[4], [x21]\n"
+      "ld1 { v22.h }[4], [x22]\n"
+      "ld1 { v21.h }[4], [x21]\n"
       "b 7f\n"
       "5:"  // odd_loads_2_0
       "tbz %x[width], #1, 6f\n"
       "ldr s28, [x28], #0x4\n"
       "ldr s27, [x27], #0x4\n"
       "mov x20, #0x1\n"
-      "ldr s22, [x26], #0x4\n"
-      "ldr s21, [x25], #0x4\n"
+      "ldr s26, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
       "ldr s24, [x24], #0x4\n"
       "ldr s23, [x23], #0x4\n"
-      "ldr s19, [x22], #0x4\n"
-      "ldr s18, [x21], #0x4\n"
+      "ldr s22, [x22], #0x4\n"
+      "ldr s21, [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
       "ld1 { v28.h }[2], [x28]\n"
       "ld1 { v27.h }[2], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v22.h }[2], [x26]\n"
-      "ld1 { v21.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x26]\n"
+      "ld1 { v25.h }[2], [x25]\n"
       "ld1 { v24.h }[2], [x24]\n"
       "ld1 { v23.h }[2], [x23]\n"
-      "ld1 { v19.h }[2], [x22]\n"
-      "ld1 { v18.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x22]\n"
+      "ld1 { v21.h }[2], [x21]\n"
       "b 7f\n"
       "6:"  // odd_loads_1_0
       "ldr h28, [x28, #0x0]\n"
       "ldr h27, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h22, [x26, #0x0]\n"
-      "ldr h21, [x25, #0x0]\n"
+      "ldr h26, [x26, #0x0]\n"
+      "ldr h25, [x25, #0x0]\n"
       "ldr h24, [x24, #0x0]\n"
       "ldr h23, [x23, #0x0]\n"
-      "ldr h19, [x22, #0x0]\n"
-      "ldr h18, [x21, #0x0]\n"
+      "ldr h22, [x22, #0x0]\n"
+      "ldr h21, [x21, #0x0]\n"
       "7:"  // Odd load end
-      "zip1 v26.4s, v28.4s, v22.4s\n"
-      "zip1 v25.4s, v27.4s, v21.4s\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
+      "zip1 v19.4s, v27.4s, v25.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v20.4s, v24.4s, v19.4s\n"
-      "zip1 v17.4s, v23.4s, v18.4s\n"
-      "zip1 v16.4s, v26.4s, v25.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v16.4s, v20.4s, v17.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 8f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.4s, v26.4s, v25.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip2 v16.4s, v20.4s, v17.4s\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 8f\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
+      "zip2 v20.4s, v28.4s, v26.4s\n"
+      "zip2 v19.4s, v27.4s, v25.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v19.4s, v24.4s, v19.4s\n"
-      "zip2 v18.4s, v23.4s, v18.4s\n"
-      "zip1 v16.4s, v22.4s, v21.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 8f\n"
-      "zip2 v17.4s, v22.4s, v21.4s\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
-      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "8:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
index 358b83a..3ec0337 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -79,18 +79,18 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q26, [x28], #0x10\n"
-      "ldr q21, [x27], #0x10\n"
+      "ldr q20, [x28], #0x10\n"
+      "ldr q19, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x4\n"
       "cmp %x[width], #0x4\n"
       "ldr q25, [x26], #0x10\n"
       "ldr q24, [x25], #0x10\n"
-      "zip1 v16.2d, v26.2d, v21.2d\n"
+      "zip1 v16.2d, v20.2d, v19.2d\n"
       "zip1 v18.2d, v25.2d, v24.2d\n"
       "ldr q23, [x24], #0x10\n"
       "ldr q22, [x23], #0x10\n"
       "zip1 v17.2d, v23.2d, v22.2d\n"
-      "zip2 v21.2d, v26.2d, v21.2d\n"
+      "zip2 v21.2d, v20.2d, v19.2d\n"
       "ldr q20, [x22], #0x10\n"
       "ldr q19, [x21], #0x10\n"
       "str q16, [%x[out_ptr], #0x0]\n"
@@ -118,62 +118,61 @@
       "3:"  // Main loop skip
       "cbz %x[width], 6f\n"
       "tbz %x[width], #1, 4f\n"
-      "ldr d26, [x28], #0x8\n"
-      "ldr d21, [x27], #0x8\n"
+      "ldr d25, [x28], #0x8\n"
+      "ldr d24, [x27], #0x8\n"
       "mov x20, #0x1\n"
-      "ldr d25, [x26], #0x8\n"
-      "ldr d24, [x25], #0x8\n"
-      "ldr d23, [x24], #0x8\n"
-      "ldr d22, [x23], #0x8\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d19, [x21], #0x8\n"
+      "ldr d23, [x26], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d20, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d18, [x21], #0x8\n"
       "tbz %x[width], #0, 5f\n"
-      "ld1 { v26.s }[2], [x28]\n"
-      "ld1 { v21.s }[2], [x27]\n"
+      "ld1 { v25.s }[2], [x28]\n"
+      "ld1 { v24.s }[2], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v25.s }[2], [x26]\n"
-      "ld1 { v24.s }[2], [x25]\n"
-      "ld1 { v23.s }[2], [x24]\n"
-      "ld1 { v22.s }[2], [x23]\n"
-      "ld1 { v20.s }[2], [x22]\n"
-      "ld1 { v19.s }[2], [x21]\n"
+      "ld1 { v23.s }[2], [x26]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v21.s }[2], [x24]\n"
+      "ld1 { v20.s }[2], [x23]\n"
+      "ld1 { v19.s }[2], [x22]\n"
+      "ld1 { v18.s }[2], [x21]\n"
       "b 5f\n"
       "4:"  // odd_loads_1_0
-      "ldr s26, [x28, #0x0]\n"
-      "ldr s21, [x27, #0x0]\n"
+      "ldr s25, [x28, #0x0]\n"
+      "ldr s24, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr s25, [x26, #0x0]\n"
-      "ldr s24, [x25, #0x0]\n"
-      "ldr s23, [x24, #0x0]\n"
-      "ldr s22, [x23, #0x0]\n"
-      "ldr s20, [x22, #0x0]\n"
-      "ldr s19, [x21, #0x0]\n"
+      "ldr s23, [x26, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s21, [x24, #0x0]\n"
+      "ldr s20, [x23, #0x0]\n"
+      "ldr s19, [x22, #0x0]\n"
+      "ldr s18, [x21, #0x0]\n"
       "5:"  // Odd load end
       "subs x20, x20, #0x1\n"
-      "zip1 v16.2d, v26.2d, v21.2d\n"
+      "zip1 v16.2d, v25.2d, v24.2d\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v18.2d, v25.2d, v24.2d\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "zip1 v17.2d, v23.2d, v22.2d\n"
-      "zip1 v16.2d, v20.2d, v19.2d\n"
+      "zip1 v16.2d, v23.2d, v22.2d\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v21.2d, v20.2d\n"
+      "zip1 v16.2d, v19.2d, v18.2d\n"
       "str q17, [%x[out_ptr], #0x20]\n"
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "beq 6f\n"
-      "zip2 v21.2d, v26.2d, v21.2d\n"
-      "str q21, [%x[out_ptr], #0x0]\n"
-      "zip2 v18.2d, v25.2d, v24.2d\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "zip2 v17.2d, v23.2d, v22.2d\n"
-      "zip2 v16.2d, v20.2d, v19.2d\n"
+      "zip2 v16.2d, v25.2d, v24.2d\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.2d, v23.2d, v22.2d\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v17.2d, v21.2d, v20.2d\n"
+      "zip2 v16.2d, v19.2d, v18.2d\n"
       "str q17, [%x[out_ptr], #0x20]\n"
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "6:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
-      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
index d606d5a..e9799f8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -79,18 +79,18 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q26, [x28], #0x10\n"
-      "ldr q21, [x27], #0x10\n"
+      "ldr q20, [x28], #0x10\n"
+      "ldr q19, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x8\n"
       "cmp %x[width], #0x8\n"
       "ldr q25, [x26], #0x10\n"
       "ldr q24, [x25], #0x10\n"
-      "zip1 v16.2d, v26.2d, v21.2d\n"
+      "zip1 v16.2d, v20.2d, v19.2d\n"
       "zip1 v18.2d, v25.2d, v24.2d\n"
       "ldr q23, [x24], #0x10\n"
       "ldr q22, [x23], #0x10\n"
       "zip1 v17.2d, v23.2d, v22.2d\n"
-      "zip2 v21.2d, v26.2d, v21.2d\n"
+      "zip2 v21.2d, v20.2d, v19.2d\n"
       "ldr q20, [x22], #0x10\n"
       "ldr q19, [x21], #0x10\n"
       "str q16, [%x[out_ptr], #0x0]\n"
@@ -118,104 +118,103 @@
       "3:"  // Main loop skip
       "cbz %x[width], 8f\n"
       "tbz %x[width], #2, 5f\n"
-      "ldr d26, [x28], #0x8\n"
-      "ldr d21, [x27], #0x8\n"
-      "ldr d25, [x26], #0x8\n"
-      "ldr d24, [x25], #0x8\n"
-      "ldr d23, [x24], #0x8\n"
-      "ldr d22, [x23], #0x8\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d19, [x21], #0x8\n"
+      "ldr d25, [x28], #0x8\n"
+      "ldr d24, [x27], #0x8\n"
+      "ldr d23, [x26], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d20, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d18, [x21], #0x8\n"
       "tbz %x[width], #1, 4f\n"
-      "ld1 { v26.s }[2], [x28], #0x4\n"
-      "ld1 { v21.s }[2], [x27], #0x4\n"
+      "ld1 { v25.s }[2], [x28], #0x4\n"
+      "ld1 { v24.s }[2], [x27], #0x4\n"
       "mov x20, #0x2\n"
-      "ld1 { v25.s }[2], [x26], #0x4\n"
-      "ld1 { v24.s }[2], [x25], #0x4\n"
-      "ld1 { v23.s }[2], [x24], #0x4\n"
-      "ld1 { v22.s }[2], [x23], #0x4\n"
-      "ld1 { v20.s }[2], [x22], #0x4\n"
-      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v23.s }[2], [x26], #0x4\n"
+      "ld1 { v22.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v20.s }[2], [x23], #0x4\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "ld1 { v18.s }[2], [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v26.h }[6], [x28]\n"
-      "ld1 { v21.h }[6], [x27]\n"
-      "ld1 { v25.h }[6], [x26]\n"
-      "ld1 { v24.h }[6], [x25]\n"
-      "ld1 { v23.h }[6], [x24]\n"
-      "ld1 { v22.h }[6], [x23]\n"
-      "ld1 { v20.h }[6], [x22]\n"
-      "ld1 { v19.h }[6], [x21]\n"
+      "ld1 { v25.h }[6], [x28]\n"
+      "ld1 { v24.h }[6], [x27]\n"
+      "ld1 { v23.h }[6], [x26]\n"
+      "ld1 { v22.h }[6], [x25]\n"
+      "ld1 { v21.h }[6], [x24]\n"
+      "ld1 { v20.h }[6], [x23]\n"
+      "ld1 { v19.h }[6], [x22]\n"
+      "ld1 { v18.h }[6], [x21]\n"
       "b 7f\n"
       "4:"  // odd_loads_1_4
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v26.h }[4], [x28]\n"
-      "ld1 { v21.h }[4], [x27]\n"
+      "ld1 { v25.h }[4], [x28]\n"
+      "ld1 { v24.h }[4], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v25.h }[4], [x26]\n"
-      "ld1 { v24.h }[4], [x25]\n"
-      "ld1 { v23.h }[4], [x24]\n"
-      "ld1 { v22.h }[4], [x23]\n"
-      "ld1 { v20.h }[4], [x22]\n"
-      "ld1 { v19.h }[4], [x21]\n"
+      "ld1 { v23.h }[4], [x26]\n"
+      "ld1 { v22.h }[4], [x25]\n"
+      "ld1 { v21.h }[4], [x24]\n"
+      "ld1 { v20.h }[4], [x23]\n"
+      "ld1 { v19.h }[4], [x22]\n"
+      "ld1 { v18.h }[4], [x21]\n"
       "b 7f\n"
       "5:"  // odd_loads_2_0
       "tbz %x[width], #1, 6f\n"
-      "ldr s26, [x28], #0x4\n"
-      "ldr s21, [x27], #0x4\n"
+      "ldr s25, [x28], #0x4\n"
+      "ldr s24, [x27], #0x4\n"
       "mov x20, #0x1\n"
-      "ldr s25, [x26], #0x4\n"
-      "ldr s24, [x25], #0x4\n"
-      "ldr s23, [x24], #0x4\n"
-      "ldr s22, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s19, [x21], #0x4\n"
+      "ldr s23, [x26], #0x4\n"
+      "ldr s22, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s20, [x23], #0x4\n"
+      "ldr s19, [x22], #0x4\n"
+      "ldr s18, [x21], #0x4\n"
       "tbz %x[width], #0, 7f\n"
-      "ld1 { v26.h }[2], [x28]\n"
-      "ld1 { v21.h }[2], [x27]\n"
-      "ld1 { v25.h }[2], [x26]\n"
-      "ld1 { v24.h }[2], [x25]\n"
-      "ld1 { v23.h }[2], [x24]\n"
-      "ld1 { v22.h }[2], [x23]\n"
-      "ld1 { v20.h }[2], [x22]\n"
-      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v25.h }[2], [x28]\n"
+      "ld1 { v24.h }[2], [x27]\n"
+      "ld1 { v23.h }[2], [x26]\n"
+      "ld1 { v22.h }[2], [x25]\n"
+      "ld1 { v21.h }[2], [x24]\n"
+      "ld1 { v20.h }[2], [x23]\n"
+      "ld1 { v19.h }[2], [x22]\n"
+      "ld1 { v18.h }[2], [x21]\n"
       "b 7f\n"
       "6:"  // odd_loads_1_0
-      "ldr h26, [x28, #0x0]\n"
-      "ldr h21, [x27, #0x0]\n"
+      "ldr h25, [x28, #0x0]\n"
+      "ldr h24, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr h25, [x26, #0x0]\n"
-      "ldr h24, [x25, #0x0]\n"
-      "ldr h23, [x24, #0x0]\n"
-      "ldr h22, [x23, #0x0]\n"
-      "ldr h20, [x22, #0x0]\n"
-      "ldr h19, [x21, #0x0]\n"
+      "ldr h23, [x26, #0x0]\n"
+      "ldr h22, [x25, #0x0]\n"
+      "ldr h21, [x24, #0x0]\n"
+      "ldr h20, [x23, #0x0]\n"
+      "ldr h19, [x22, #0x0]\n"
+      "ldr h18, [x21, #0x0]\n"
       "7:"  // Odd load end
       "subs x20, x20, #0x1\n"
-      "zip1 v16.2d, v26.2d, v21.2d\n"
+      "zip1 v16.2d, v25.2d, v24.2d\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v18.2d, v25.2d, v24.2d\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "zip1 v17.2d, v23.2d, v22.2d\n"
-      "zip1 v16.2d, v20.2d, v19.2d\n"
+      "zip1 v16.2d, v23.2d, v22.2d\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v21.2d, v20.2d\n"
+      "zip1 v16.2d, v19.2d, v18.2d\n"
       "str q17, [%x[out_ptr], #0x20]\n"
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "beq 8f\n"
-      "zip2 v21.2d, v26.2d, v21.2d\n"
-      "str q21, [%x[out_ptr], #0x0]\n"
-      "zip2 v18.2d, v25.2d, v24.2d\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "zip2 v17.2d, v23.2d, v22.2d\n"
-      "zip2 v16.2d, v20.2d, v19.2d\n"
+      "zip2 v16.2d, v25.2d, v24.2d\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.2d, v23.2d, v22.2d\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v17.2d, v21.2d, v20.2d\n"
+      "zip2 v16.2d, v19.2d, v18.2d\n"
       "str q17, [%x[out_ptr], #0x20]\n"
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "8:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
-      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
 }
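
The block-2 and block-4 variants above differ from the block-1 kernels only in interleave granularity: the zips operate on 32-bit or 64-bit lanes, so groups of two or four consecutive elements from each row stay contiguous in the output (for example, zip1 vd.2d on 8h data keeps quads of bf16 together). A hedged scalar sketch of that layout, using hypothetical names and ignoring tail padding, is:

    // Illustrative scalar model (hypothetical): blocked 8-row interleave,
    // block = 2 for the *_block2_* kernels, 4 for the *_block4_* kernels.
    #include <cstddef>

    template <typename T>
    static void interleave8_blockN_ref(const T *const rows[8], size_t width,
                                       size_t block, T *out)
    {
        for (size_t j = 0; j < width; j += block) {
            for (int r = 0; r < 8; r++) {
                for (size_t b = 0; b < block && j + b < width; b++) {
                    *out++ = rows[r][j + b]; // zip1/zip2 on .4s or .2d lanes above
                }
            }
        }
    }
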
 
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
index dfec143..730bfd6 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
@@ -79,14 +79,14 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q23, [x28], #0x10\n"
-      "ldr q22, [x26], #0x10\n"
-      ".inst 0x0ea16af7  // bfcvtn v23.4h, v23.4s\n"
-      ".inst 0x0ea16ad6  // bfcvtn v22.4h, v22.4s\n"
-      "ldr q21, [x24], #0x10\n"
-      "ldr q20, [x22], #0x10\n"
-      ".inst 0x0ea16ab5  // bfcvtn v21.4h, v21.4s\n"
-      ".inst 0x0ea16a94  // bfcvtn v20.4h, v20.4s\n"
+      "ldr q17, [x28], #0x10\n"
+      "ldr q16, [x26], #0x10\n"
+      ".inst 0x0ea16a37  // bfcvtn v23.4h, v17.4s\n"
+      ".inst 0x0ea16a16  // bfcvtn v22.4h, v16.4s\n"
+      "ldr q17, [x24], #0x10\n"
+      "ldr q16, [x22], #0x10\n"
+      ".inst 0x0ea16a35  // bfcvtn v21.4h, v17.4s\n"
+      ".inst 0x0ea16a14  // bfcvtn v20.4h, v16.4s\n"
       "ldr q19, [x27], #0x10\n"
       "ldr q18, [x25], #0x10\n"
       "subs %x[width], %x[width], #0x4\n"
@@ -114,51 +114,50 @@
       "3:"  // Main loop skip
       "cbz %x[width], 6f\n"
       "tbz %x[width], #1, 4f\n"
-      "ldr d23, [x28], #0x8\n"
-      "ldr d19, [x27], #0x8\n"
+      "ldr d19, [x28], #0x8\n"
+      "ldr d23, [x27], #0x8\n"
       "mov x20, #0x1\n"
-      "ldr d22, [x26], #0x8\n"
-      "ldr d18, [x25], #0x8\n"
-      "ldr d21, [x24], #0x8\n"
-      "ldr d17, [x23], #0x8\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d16, [x21], #0x8\n"
+      "ldr d18, [x26], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d17, [x24], #0x8\n"
+      "ldr d21, [x23], #0x8\n"
+      "ldr d16, [x22], #0x8\n"
+      "ldr d20, [x21], #0x8\n"
       "tbz %x[width], #0, 5f\n"
-      "ld1 { v23.s }[2], [x28]\n"
-      "ld1 { v19.s }[2], [x27]\n"
-      "ld1 { v22.s }[2], [x26]\n"
-      "ld1 { v18.s }[2], [x25]\n"
-      "ld1 { v21.s }[2], [x24]\n"
-      "ld1 { v17.s }[2], [x23]\n"
-      "ld1 { v20.s }[2], [x22]\n"
-      "ld1 { v16.s }[2], [x21]\n"
+      "ld1 { v19.s }[2], [x28]\n"
+      "ld1 { v23.s }[2], [x27]\n"
+      "ld1 { v18.s }[2], [x26]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v17.s }[2], [x24]\n"
+      "ld1 { v21.s }[2], [x23]\n"
+      "ld1 { v16.s }[2], [x22]\n"
+      "ld1 { v20.s }[2], [x21]\n"
       "b 5f\n"
       "4:"  // odd_loads_1_0
-      "ldr s23, [x28, #0x0]\n"
-      "ldr s19, [x27, #0x0]\n"
+      "ldr s19, [x28, #0x0]\n"
+      "ldr s23, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr s22, [x26, #0x0]\n"
-      "ldr s18, [x25, #0x0]\n"
-      "ldr s21, [x24, #0x0]\n"
-      "ldr s17, [x23, #0x0]\n"
-      "ldr s20, [x22, #0x0]\n"
-      "ldr s16, [x21, #0x0]\n"
+      "ldr s18, [x26, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s17, [x24, #0x0]\n"
+      "ldr s21, [x23, #0x0]\n"
+      "ldr s16, [x22, #0x0]\n"
+      "ldr s20, [x21, #0x0]\n"
       "5:"  // Odd load end
-      ".inst 0x0ea16af7  // bfcvtn v23.4h, v23.4s\n"
-      ".inst 0x0ea16ad6  // bfcvtn v22.4h, v22.4s\n"
-      ".inst 0x0ea16ab5  // bfcvtn v21.4h, v21.4s\n"
-      ".inst 0x0ea16a94  // bfcvtn v20.4h, v20.4s\n"
-      ".inst 0x4ea16a77  // bfcvtn2 v23.8h, v19.4s\n"
-      ".inst 0x4ea16a56  // bfcvtn2 v22.8h, v18.4s\n"
-      "str q23, [%x[out_ptr], #0x0]\n"
-      ".inst 0x4ea16a35  // bfcvtn2 v21.8h, v17.4s\n"
-      ".inst 0x4ea16a14  // bfcvtn2 v20.8h, v16.4s\n"
-      "str q22, [%x[out_ptr], #0x10]\n"
-      "str q21, [%x[out_ptr], #0x20]\n"
-      "str q20, [%x[out_ptr], #0x30]\n"
+      ".inst 0x0ea16a73  // bfcvtn v19.4h, v19.4s\n"
+      ".inst 0x0ea16a52  // bfcvtn v18.4h, v18.4s\n"
+      ".inst 0x0ea16a31  // bfcvtn v17.4h, v17.4s\n"
+      ".inst 0x0ea16a10  // bfcvtn v16.4h, v16.4s\n"
+      ".inst 0x4ea16af3  // bfcvtn2 v19.8h, v23.4s\n"
+      ".inst 0x4ea16ad2  // bfcvtn2 v18.8h, v22.4s\n"
+      "str q19, [%x[out_ptr], #0x0]\n"
+      ".inst 0x4ea16ab1  // bfcvtn2 v17.8h, v21.4s\n"
+      ".inst 0x4ea16a90  // bfcvtn2 v16.8h, v20.4s\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "6:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
index 54f15f8..15d8ddb 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -79,29 +79,29 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q28, [x28], #0x10\n"
-      "ldr q27, [x27], #0x10\n"
+      "ldr q20, [x28], #0x10\n"
+      "ldr q18, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x10\n"
       "cmp %x[width], #0x10\n"
-      "ldr q22, [x26], #0x10\n"
-      "ldr q21, [x25], #0x10\n"
-      "zip1 v26.4s, v28.4s, v22.4s\n"
-      "zip1 v25.4s, v27.4s, v21.4s\n"
-      "ldr q24, [x24], #0x10\n"
+      "ldr q17, [x26], #0x10\n"
+      "ldr q16, [x25], #0x10\n"
+      "zip1 v25.4s, v20.4s, v17.4s\n"
+      "zip1 v24.4s, v18.4s, v16.4s\n"
+      "ldr q19, [x24], #0x10\n"
       "ldr q23, [x23], #0x10\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
-      "ldr q19, [x22], #0x10\n"
-      "ldr q18, [x21], #0x10\n"
-      "zip1 v20.4s, v24.4s, v19.4s\n"
-      "zip1 v17.4s, v23.4s, v18.4s\n"
-      "zip2 v19.4s, v24.4s, v19.4s\n"
-      "zip2 v18.4s, v23.4s, v18.4s\n"
+      "zip2 v22.4s, v20.4s, v17.4s\n"
+      "zip2 v21.4s, v18.4s, v16.4s\n"
+      "ldr q18, [x22], #0x10\n"
+      "ldr q16, [x21], #0x10\n"
+      "zip1 v20.4s, v19.4s, v18.4s\n"
+      "zip1 v17.4s, v23.4s, v16.4s\n"
+      "zip2 v19.4s, v19.4s, v18.4s\n"
+      "zip2 v18.4s, v23.4s, v16.4s\n"
       "prfm pldl1keep, [x28, #0x70]\n"
       "prfm pldl1keep, [x27, #0x70]\n"
       "prfm pldl1keep, [x26, #0x70]\n"
       "prfm pldl1keep, [x25, #0x70]\n"
-      "zip1 v16.4s, v26.4s, v25.4s\n"
+      "zip1 v16.4s, v25.4s, v24.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
       "prfm pldl1keep, [x24, #0x70]\n"
       "prfm pldl1keep, [x23, #0x70]\n"
@@ -109,7 +109,7 @@
       "str q16, [%x[out_ptr], #0x10]\n"
       "prfm pldl1keep, [x22, #0x70]\n"
       "prfm pldl1keep, [x21, #0x70]\n"
-      "zip2 v16.4s, v26.4s, v25.4s\n"
+      "zip2 v16.4s, v25.4s, v24.4s\n"
       "str q16, [%x[out_ptr], #0x20]\n"
       "zip2 v16.4s, v20.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x30]\n"
@@ -128,40 +128,40 @@
       "tbz %x[width], #3, 7f\n"
       "ldr d28, [x28], #0x8\n"
       "ldr d27, [x27], #0x8\n"
-      "ldr d22, [x26], #0x8\n"
-      "ldr d21, [x25], #0x8\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
       "ldr d24, [x24], #0x8\n"
       "ldr d23, [x23], #0x8\n"
-      "ldr d19, [x22], #0x8\n"
-      "ldr d18, [x21], #0x8\n"
+      "ldr d22, [x22], #0x8\n"
+      "ldr d21, [x21], #0x8\n"
       "tbz %x[width], #2, 5f\n"
       "ld1 { v28.s }[2], [x28], #0x4\n"
       "ld1 { v27.s }[2], [x27], #0x4\n"
-      "ld1 { v22.s }[2], [x26], #0x4\n"
-      "ld1 { v21.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
       "ld1 { v24.s }[2], [x24], #0x4\n"
       "ld1 { v23.s }[2], [x23], #0x4\n"
-      "ld1 { v19.s }[2], [x22], #0x4\n"
-      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x22], #0x4\n"
+      "ld1 { v21.s }[2], [x21], #0x4\n"
       "tbz %x[width], #1, 4f\n"
       "ld1 { v28.h }[6], [x28], #0x2\n"
       "ld1 { v27.h }[6], [x27], #0x2\n"
       "mov x20, #0x4\n"
-      "ld1 { v22.h }[6], [x26], #0x2\n"
-      "ld1 { v21.h }[6], [x25], #0x2\n"
+      "ld1 { v26.h }[6], [x26], #0x2\n"
+      "ld1 { v25.h }[6], [x25], #0x2\n"
       "ld1 { v24.h }[6], [x24], #0x2\n"
       "ld1 { v23.h }[6], [x23], #0x2\n"
-      "ld1 { v19.h }[6], [x22], #0x2\n"
-      "ld1 { v18.h }[6], [x21], #0x2\n"
+      "ld1 { v22.h }[6], [x22], #0x2\n"
+      "ld1 { v21.h }[6], [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
       "ld1 { v28.b }[14], [x28]\n"
       "ld1 { v27.b }[14], [x27]\n"
-      "ld1 { v22.b }[14], [x26]\n"
-      "ld1 { v21.b }[14], [x25]\n"
+      "ld1 { v26.b }[14], [x26]\n"
+      "ld1 { v25.b }[14], [x25]\n"
       "ld1 { v24.b }[14], [x24]\n"
       "ld1 { v23.b }[14], [x23]\n"
-      "ld1 { v19.b }[14], [x22]\n"
-      "ld1 { v18.b }[14], [x21]\n"
+      "ld1 { v22.b }[14], [x22]\n"
+      "ld1 { v21.b }[14], [x21]\n"
       "b 11f\n"
       "4:"  // odd_loads_1_12
       "mov x20, #0x3\n"
@@ -169,33 +169,33 @@
       "ld1 { v28.b }[12], [x28]\n"
       "ld1 { v27.b }[12], [x27]\n"
       "mov x20, #0x4\n"
-      "ld1 { v22.b }[12], [x26]\n"
-      "ld1 { v21.b }[12], [x25]\n"
+      "ld1 { v26.b }[12], [x26]\n"
+      "ld1 { v25.b }[12], [x25]\n"
       "ld1 { v24.b }[12], [x24]\n"
       "ld1 { v23.b }[12], [x23]\n"
-      "ld1 { v19.b }[12], [x22]\n"
-      "ld1 { v18.b }[12], [x21]\n"
+      "ld1 { v22.b }[12], [x22]\n"
+      "ld1 { v21.b }[12], [x21]\n"
       "b 11f\n"
       "5:"  // odd_loads_2_8
       "tbz %x[width], #1, 6f\n"
       "ld1 { v28.h }[4], [x28], #0x2\n"
       "ld1 { v27.h }[4], [x27], #0x2\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.h }[4], [x26], #0x2\n"
-      "ld1 { v21.h }[4], [x25], #0x2\n"
+      "ld1 { v26.h }[4], [x26], #0x2\n"
+      "ld1 { v25.h }[4], [x25], #0x2\n"
       "ld1 { v24.h }[4], [x24], #0x2\n"
       "ld1 { v23.h }[4], [x23], #0x2\n"
-      "ld1 { v19.h }[4], [x22], #0x2\n"
-      "ld1 { v18.h }[4], [x21], #0x2\n"
+      "ld1 { v22.h }[4], [x22], #0x2\n"
+      "ld1 { v21.h }[4], [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
       "ld1 { v28.b }[10], [x28]\n"
       "ld1 { v27.b }[10], [x27]\n"
-      "ld1 { v22.b }[10], [x26]\n"
-      "ld1 { v21.b }[10], [x25]\n"
+      "ld1 { v26.b }[10], [x26]\n"
+      "ld1 { v25.b }[10], [x25]\n"
       "ld1 { v24.b }[10], [x24]\n"
       "ld1 { v23.b }[10], [x23]\n"
-      "ld1 { v19.b }[10], [x22]\n"
-      "ld1 { v18.b }[10], [x21]\n"
+      "ld1 { v22.b }[10], [x22]\n"
+      "ld1 { v21.b }[10], [x21]\n"
       "b 11f\n"
       "6:"  // odd_loads_1_8
       "mov x20, #0x2\n"
@@ -203,42 +203,42 @@
       "ld1 { v28.b }[8], [x28]\n"
       "ld1 { v27.b }[8], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v22.b }[8], [x26]\n"
-      "ld1 { v21.b }[8], [x25]\n"
+      "ld1 { v26.b }[8], [x26]\n"
+      "ld1 { v25.b }[8], [x25]\n"
       "ld1 { v24.b }[8], [x24]\n"
       "ld1 { v23.b }[8], [x23]\n"
-      "ld1 { v19.b }[8], [x22]\n"
-      "ld1 { v18.b }[8], [x21]\n"
+      "ld1 { v22.b }[8], [x22]\n"
+      "ld1 { v21.b }[8], [x21]\n"
       "b 11f\n"
       "7:"  // odd_loads_4_0
       "tbz %x[width], #2, 9f\n"
       "ldr s28, [x28], #0x4\n"
       "ldr s27, [x27], #0x4\n"
-      "ldr s22, [x26], #0x4\n"
-      "ldr s21, [x25], #0x4\n"
+      "ldr s26, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
       "ldr s24, [x24], #0x4\n"
       "ldr s23, [x23], #0x4\n"
-      "ldr s19, [x22], #0x4\n"
-      "ldr s18, [x21], #0x4\n"
+      "ldr s22, [x22], #0x4\n"
+      "ldr s21, [x21], #0x4\n"
       "tbz %x[width], #1, 8f\n"
       "ld1 { v28.h }[2], [x28], #0x2\n"
       "ld1 { v27.h }[2], [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ld1 { v22.h }[2], [x26], #0x2\n"
-      "ld1 { v21.h }[2], [x25], #0x2\n"
+      "ld1 { v26.h }[2], [x26], #0x2\n"
+      "ld1 { v25.h }[2], [x25], #0x2\n"
       "ld1 { v24.h }[2], [x24], #0x2\n"
       "ld1 { v23.h }[2], [x23], #0x2\n"
-      "ld1 { v19.h }[2], [x22], #0x2\n"
-      "ld1 { v18.h }[2], [x21], #0x2\n"
+      "ld1 { v22.h }[2], [x22], #0x2\n"
+      "ld1 { v21.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
       "ld1 { v28.b }[6], [x28]\n"
       "ld1 { v27.b }[6], [x27]\n"
-      "ld1 { v22.b }[6], [x26]\n"
-      "ld1 { v21.b }[6], [x25]\n"
+      "ld1 { v26.b }[6], [x26]\n"
+      "ld1 { v25.b }[6], [x25]\n"
       "ld1 { v24.b }[6], [x24]\n"
       "ld1 { v23.b }[6], [x23]\n"
-      "ld1 { v19.b }[6], [x22]\n"
-      "ld1 { v18.b }[6], [x21]\n"
+      "ld1 { v22.b }[6], [x22]\n"
+      "ld1 { v21.b }[6], [x21]\n"
       "b 11f\n"
       "8:"  // odd_loads_1_4
       "mov x20, #0x1\n"
@@ -246,81 +246,80 @@
       "ld1 { v28.b }[4], [x28]\n"
       "ld1 { v27.b }[4], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v22.b }[4], [x26]\n"
-      "ld1 { v21.b }[4], [x25]\n"
+      "ld1 { v26.b }[4], [x26]\n"
+      "ld1 { v25.b }[4], [x25]\n"
       "ld1 { v24.b }[4], [x24]\n"
       "ld1 { v23.b }[4], [x23]\n"
-      "ld1 { v19.b }[4], [x22]\n"
-      "ld1 { v18.b }[4], [x21]\n"
+      "ld1 { v22.b }[4], [x22]\n"
+      "ld1 { v21.b }[4], [x21]\n"
       "b 11f\n"
       "9:"  // odd_loads_2_0
       "tbz %x[width], #1, 10f\n"
       "ldr h28, [x28], #0x2\n"
       "ldr h27, [x27], #0x2\n"
       "mov x20, #0x1\n"
-      "ldr h22, [x26], #0x2\n"
-      "ldr h21, [x25], #0x2\n"
+      "ldr h26, [x26], #0x2\n"
+      "ldr h25, [x25], #0x2\n"
       "ldr h24, [x24], #0x2\n"
       "ldr h23, [x23], #0x2\n"
-      "ldr h19, [x22], #0x2\n"
-      "ldr h18, [x21], #0x2\n"
+      "ldr h22, [x22], #0x2\n"
+      "ldr h21, [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
       "ld1 { v28.b }[2], [x28]\n"
       "ld1 { v27.b }[2], [x27]\n"
-      "ld1 { v22.b }[2], [x26]\n"
-      "ld1 { v21.b }[2], [x25]\n"
+      "ld1 { v26.b }[2], [x26]\n"
+      "ld1 { v25.b }[2], [x25]\n"
       "ld1 { v24.b }[2], [x24]\n"
       "ld1 { v23.b }[2], [x23]\n"
-      "ld1 { v19.b }[2], [x22]\n"
-      "ld1 { v18.b }[2], [x21]\n"
+      "ld1 { v22.b }[2], [x22]\n"
+      "ld1 { v21.b }[2], [x21]\n"
       "b 11f\n"
       "10:"  // odd_loads_1_0
       "ldr b28, [x28, #0x0]\n"
       "ldr b27, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b22, [x26, #0x0]\n"
-      "ldr b21, [x25, #0x0]\n"
+      "ldr b26, [x26, #0x0]\n"
+      "ldr b25, [x25, #0x0]\n"
       "ldr b24, [x24, #0x0]\n"
       "ldr b23, [x23, #0x0]\n"
-      "ldr b19, [x22, #0x0]\n"
-      "ldr b18, [x21, #0x0]\n"
+      "ldr b22, [x22, #0x0]\n"
+      "ldr b21, [x21, #0x0]\n"
       "11:"  // Odd load end
-      "zip1 v26.4s, v28.4s, v22.4s\n"
-      "zip1 v25.4s, v27.4s, v21.4s\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
+      "zip1 v19.4s, v27.4s, v25.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v20.4s, v24.4s, v19.4s\n"
-      "zip1 v17.4s, v23.4s, v18.4s\n"
-      "zip1 v16.4s, v26.4s, v25.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v16.4s, v20.4s, v17.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 12f\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v16.4s, v26.4s, v25.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip2 v16.4s, v20.4s, v17.4s\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 12f\n"
-      "zip2 v22.4s, v28.4s, v22.4s\n"
-      "zip2 v21.4s, v27.4s, v21.4s\n"
+      "zip2 v20.4s, v28.4s, v26.4s\n"
+      "zip2 v19.4s, v27.4s, v25.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v19.4s, v24.4s, v19.4s\n"
-      "zip2 v18.4s, v23.4s, v18.4s\n"
-      "zip1 v16.4s, v22.4s, v21.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 12f\n"
-      "zip2 v17.4s, v22.4s, v21.4s\n"
-      "str q17, [%x[out_ptr], #0x0]\n"
-      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "zip2 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
       "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "12:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
       : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
index 2db5412..6c41b5f 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
@@ -153,202 +153,202 @@
       "5:"  // Main loop skip
       "cbz %x[width], 14f\n"
       "tbz %x[width], #3, 9f\n"
-      "ldr d30, [x28], #0x8\n"
-      "ldr d29, [x27], #0x8\n"
-      "ldr d28, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "ldr d20, [x24], #0x8\n"
-      "ldr d26, [x23], #0x8\n"
-      "ldr d19, [x22], #0x8\n"
-      "ldr d18, [x21], #0x8\n"
+      "ldr d29, [x28], #0x8\n"
+      "ldr d28, [x27], #0x8\n"
+      "ldr d27, [x26], #0x8\n"
+      "ldr d26, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d23, [x22], #0x8\n"
+      "ldr d22, [x21], #0x8\n"
       "tbz %x[width], #2, 7f\n"
-      "ld1 { v30.s }[2], [x28], #0x4\n"
-      "ld1 { v29.s }[2], [x27], #0x4\n"
-      "ld1 { v28.s }[2], [x26], #0x4\n"
-      "ld1 { v27.s }[2], [x25], #0x4\n"
-      "ld1 { v20.s }[2], [x24], #0x4\n"
-      "ld1 { v26.s }[2], [x23], #0x4\n"
-      "ld1 { v19.s }[2], [x22], #0x4\n"
-      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v29.s }[2], [x28], #0x4\n"
+      "ld1 { v28.s }[2], [x27], #0x4\n"
+      "ld1 { v27.s }[2], [x26], #0x4\n"
+      "ld1 { v26.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v23.s }[2], [x22], #0x4\n"
+      "ld1 { v22.s }[2], [x21], #0x4\n"
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v30.h }[6], [x28], #0x2\n"
-      "ld1 { v29.h }[6], [x27], #0x2\n"
+      "ld1 { v29.h }[6], [x28], #0x2\n"
+      "ld1 { v28.h }[6], [x27], #0x2\n"
       "mov x20, #0x4\n"
-      "ld1 { v28.h }[6], [x26], #0x2\n"
-      "ld1 { v27.h }[6], [x25], #0x2\n"
-      "ld1 { v20.h }[6], [x24], #0x2\n"
-      "ld1 { v26.h }[6], [x23], #0x2\n"
-      "ld1 { v19.h }[6], [x22], #0x2\n"
-      "ld1 { v18.h }[6], [x21], #0x2\n"
+      "ld1 { v27.h }[6], [x26], #0x2\n"
+      "ld1 { v26.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v24.h }[6], [x23], #0x2\n"
+      "ld1 { v23.h }[6], [x22], #0x2\n"
+      "ld1 { v22.h }[6], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[14], [x28]\n"
-      "ld1 { v29.b }[14], [x27]\n"
-      "ld1 { v28.b }[14], [x26]\n"
-      "ld1 { v27.b }[14], [x25]\n"
-      "ld1 { v20.b }[14], [x24]\n"
-      "ld1 { v26.b }[14], [x23]\n"
-      "ld1 { v19.b }[14], [x22]\n"
-      "ld1 { v18.b }[14], [x21]\n"
+      "ld1 { v29.b }[14], [x28]\n"
+      "ld1 { v28.b }[14], [x27]\n"
+      "ld1 { v27.b }[14], [x26]\n"
+      "ld1 { v26.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v24.b }[14], [x23]\n"
+      "ld1 { v23.b }[14], [x22]\n"
+      "ld1 { v22.b }[14], [x21]\n"
       "b 13f\n"
       "6:"  // odd_loads_1_12
       "mov x20, #0x3\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[12], [x28]\n"
-      "ld1 { v29.b }[12], [x27]\n"
+      "ld1 { v29.b }[12], [x28]\n"
+      "ld1 { v28.b }[12], [x27]\n"
       "mov x20, #0x4\n"
-      "ld1 { v28.b }[12], [x26]\n"
-      "ld1 { v27.b }[12], [x25]\n"
-      "ld1 { v20.b }[12], [x24]\n"
-      "ld1 { v26.b }[12], [x23]\n"
-      "ld1 { v19.b }[12], [x22]\n"
-      "ld1 { v18.b }[12], [x21]\n"
+      "ld1 { v27.b }[12], [x26]\n"
+      "ld1 { v26.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v24.b }[12], [x23]\n"
+      "ld1 { v23.b }[12], [x22]\n"
+      "ld1 { v22.b }[12], [x21]\n"
       "b 13f\n"
       "7:"  // odd_loads_2_8
       "tbz %x[width], #1, 8f\n"
-      "ld1 { v30.h }[4], [x28], #0x2\n"
-      "ld1 { v29.h }[4], [x27], #0x2\n"
+      "ld1 { v29.h }[4], [x28], #0x2\n"
+      "ld1 { v28.h }[4], [x27], #0x2\n"
       "mov x20, #0x3\n"
-      "ld1 { v28.h }[4], [x26], #0x2\n"
-      "ld1 { v27.h }[4], [x25], #0x2\n"
-      "ld1 { v20.h }[4], [x24], #0x2\n"
-      "ld1 { v26.h }[4], [x23], #0x2\n"
-      "ld1 { v19.h }[4], [x22], #0x2\n"
-      "ld1 { v18.h }[4], [x21], #0x2\n"
+      "ld1 { v27.h }[4], [x26], #0x2\n"
+      "ld1 { v26.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v24.h }[4], [x23], #0x2\n"
+      "ld1 { v23.h }[4], [x22], #0x2\n"
+      "ld1 { v22.h }[4], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[10], [x28]\n"
-      "ld1 { v29.b }[10], [x27]\n"
-      "ld1 { v28.b }[10], [x26]\n"
-      "ld1 { v27.b }[10], [x25]\n"
-      "ld1 { v20.b }[10], [x24]\n"
-      "ld1 { v26.b }[10], [x23]\n"
-      "ld1 { v19.b }[10], [x22]\n"
-      "ld1 { v18.b }[10], [x21]\n"
+      "ld1 { v29.b }[10], [x28]\n"
+      "ld1 { v28.b }[10], [x27]\n"
+      "ld1 { v27.b }[10], [x26]\n"
+      "ld1 { v26.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v24.b }[10], [x23]\n"
+      "ld1 { v23.b }[10], [x22]\n"
+      "ld1 { v22.b }[10], [x21]\n"
       "b 13f\n"
       "8:"  // odd_loads_1_8
       "mov x20, #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[8], [x28]\n"
-      "ld1 { v29.b }[8], [x27]\n"
+      "ld1 { v29.b }[8], [x28]\n"
+      "ld1 { v28.b }[8], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v28.b }[8], [x26]\n"
-      "ld1 { v27.b }[8], [x25]\n"
-      "ld1 { v20.b }[8], [x24]\n"
-      "ld1 { v26.b }[8], [x23]\n"
-      "ld1 { v19.b }[8], [x22]\n"
-      "ld1 { v18.b }[8], [x21]\n"
+      "ld1 { v27.b }[8], [x26]\n"
+      "ld1 { v26.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v24.b }[8], [x23]\n"
+      "ld1 { v23.b }[8], [x22]\n"
+      "ld1 { v22.b }[8], [x21]\n"
       "b 13f\n"
       "9:"  // odd_loads_4_0
       "tbz %x[width], #2, 11f\n"
-      "ldr s30, [x28], #0x4\n"
-      "ldr s29, [x27], #0x4\n"
-      "ldr s28, [x26], #0x4\n"
-      "ldr s27, [x25], #0x4\n"
-      "ldr s20, [x24], #0x4\n"
-      "ldr s26, [x23], #0x4\n"
-      "ldr s19, [x22], #0x4\n"
-      "ldr s18, [x21], #0x4\n"
+      "ldr s29, [x28], #0x4\n"
+      "ldr s28, [x27], #0x4\n"
+      "ldr s27, [x26], #0x4\n"
+      "ldr s26, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s23, [x22], #0x4\n"
+      "ldr s22, [x21], #0x4\n"
       "tbz %x[width], #1, 10f\n"
-      "ld1 { v30.h }[2], [x28], #0x2\n"
-      "ld1 { v29.h }[2], [x27], #0x2\n"
+      "ld1 { v29.h }[2], [x28], #0x2\n"
+      "ld1 { v28.h }[2], [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ld1 { v28.h }[2], [x26], #0x2\n"
-      "ld1 { v27.h }[2], [x25], #0x2\n"
-      "ld1 { v20.h }[2], [x24], #0x2\n"
-      "ld1 { v26.h }[2], [x23], #0x2\n"
-      "ld1 { v19.h }[2], [x22], #0x2\n"
-      "ld1 { v18.h }[2], [x21], #0x2\n"
+      "ld1 { v27.h }[2], [x26], #0x2\n"
+      "ld1 { v26.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v24.h }[2], [x23], #0x2\n"
+      "ld1 { v23.h }[2], [x22], #0x2\n"
+      "ld1 { v22.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[6], [x28]\n"
-      "ld1 { v29.b }[6], [x27]\n"
-      "ld1 { v28.b }[6], [x26]\n"
-      "ld1 { v27.b }[6], [x25]\n"
-      "ld1 { v20.b }[6], [x24]\n"
-      "ld1 { v26.b }[6], [x23]\n"
-      "ld1 { v19.b }[6], [x22]\n"
-      "ld1 { v18.b }[6], [x21]\n"
+      "ld1 { v29.b }[6], [x28]\n"
+      "ld1 { v28.b }[6], [x27]\n"
+      "ld1 { v27.b }[6], [x26]\n"
+      "ld1 { v26.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v24.b }[6], [x23]\n"
+      "ld1 { v23.b }[6], [x22]\n"
+      "ld1 { v22.b }[6], [x21]\n"
       "b 13f\n"
       "10:"  // odd_loads_1_4
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[4], [x28]\n"
-      "ld1 { v29.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x28]\n"
+      "ld1 { v28.b }[4], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v28.b }[4], [x26]\n"
-      "ld1 { v27.b }[4], [x25]\n"
-      "ld1 { v20.b }[4], [x24]\n"
-      "ld1 { v26.b }[4], [x23]\n"
-      "ld1 { v19.b }[4], [x22]\n"
-      "ld1 { v18.b }[4], [x21]\n"
+      "ld1 { v27.b }[4], [x26]\n"
+      "ld1 { v26.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v24.b }[4], [x23]\n"
+      "ld1 { v23.b }[4], [x22]\n"
+      "ld1 { v22.b }[4], [x21]\n"
       "b 13f\n"
       "11:"  // odd_loads_2_0
       "tbz %x[width], #1, 12f\n"
-      "ldr h30, [x28], #0x2\n"
-      "ldr h29, [x27], #0x2\n"
+      "ldr h29, [x28], #0x2\n"
+      "ldr h28, [x27], #0x2\n"
       "mov x20, #0x1\n"
-      "ldr h28, [x26], #0x2\n"
-      "ldr h27, [x25], #0x2\n"
-      "ldr h20, [x24], #0x2\n"
-      "ldr h26, [x23], #0x2\n"
-      "ldr h19, [x22], #0x2\n"
-      "ldr h18, [x21], #0x2\n"
+      "ldr h27, [x26], #0x2\n"
+      "ldr h26, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h24, [x23], #0x2\n"
+      "ldr h23, [x22], #0x2\n"
+      "ldr h22, [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[2], [x28]\n"
-      "ld1 { v29.b }[2], [x27]\n"
-      "ld1 { v28.b }[2], [x26]\n"
-      "ld1 { v27.b }[2], [x25]\n"
-      "ld1 { v20.b }[2], [x24]\n"
-      "ld1 { v26.b }[2], [x23]\n"
-      "ld1 { v19.b }[2], [x22]\n"
-      "ld1 { v18.b }[2], [x21]\n"
+      "ld1 { v29.b }[2], [x28]\n"
+      "ld1 { v28.b }[2], [x27]\n"
+      "ld1 { v27.b }[2], [x26]\n"
+      "ld1 { v26.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v24.b }[2], [x23]\n"
+      "ld1 { v23.b }[2], [x22]\n"
+      "ld1 { v22.b }[2], [x21]\n"
       "b 13f\n"
       "12:"  // odd_loads_1_0
-      "ldr b30, [x28, #0x0]\n"
-      "ldr b29, [x27, #0x0]\n"
+      "ldr b29, [x28, #0x0]\n"
+      "ldr b28, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b28, [x26, #0x0]\n"
-      "ldr b27, [x25, #0x0]\n"
-      "ldr b20, [x24, #0x0]\n"
-      "ldr b26, [x23, #0x0]\n"
-      "ldr b19, [x22, #0x0]\n"
-      "ldr b18, [x21, #0x0]\n"
+      "ldr b27, [x26, #0x0]\n"
+      "ldr b26, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b24, [x23, #0x0]\n"
+      "ldr b23, [x22, #0x0]\n"
+      "ldr b22, [x21, #0x0]\n"
       "13:"  // Odd load end
-      "zip1 v22.4s, v30.4s, v28.4s\n"
       "zip1 v21.4s, v29.4s, v27.4s\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v17.4s, v20.4s, v19.4s\n"
-      "zip1 v16.4s, v26.4s, v18.4s\n"
-      "zip1 v25.4s, v22.4s, v21.4s\n"
-      "zip1 v24.4s, v17.4s, v16.4s\n"
-      "str q25, [%x[out_ptr], #0x0]\n"
-      "sadalp v2.8h, v25.16b\n"
-      "str q24, [%x[out_ptr], #0x10]\n"
-      "sadalp v1.8h, v24.16b\n"
+      "zip1 v19.4s, v25.4s, v23.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v21.4s, v20.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "sadalp v1.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 14f\n"
-      "zip2 v23.4s, v22.4s, v21.4s\n"
-      "zip2 v22.4s, v17.4s, v16.4s\n"
+      "zip2 v17.4s, v21.4s, v20.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
       "subs x20, x20, #0x1\n"
-      "str q23, [%x[out_ptr], #0x0]\n"
-      "sadalp v2.8h, v23.16b\n"
-      "str q22, [%x[out_ptr], #0x10]\n"
-      "sadalp v1.8h, v22.16b\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "sadalp v1.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 14f\n"
-      "zip2 v21.4s, v30.4s, v28.4s\n"
-      "zip2 v17.4s, v29.4s, v27.4s\n"
+      "zip2 v21.4s, v29.4s, v27.4s\n"
+      "zip2 v20.4s, v28.4s, v26.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v20.4s, v20.4s, v19.4s\n"
-      "zip2 v16.4s, v26.4s, v18.4s\n"
-      "zip1 v19.4s, v21.4s, v17.4s\n"
-      "zip1 v18.4s, v20.4s, v16.4s\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
-      "sadalp v2.8h, v19.16b\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "sadalp v1.8h, v18.16b\n"
+      "zip2 v19.4s, v25.4s, v23.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v21.4s, v20.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "sadalp v1.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 14f\n"
-      "zip2 v17.4s, v21.4s, v17.4s\n"
-      "zip2 v16.4s, v20.4s, v16.4s\n"
+      "zip2 v17.4s, v21.4s, v20.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
       "str q17, [%x[out_ptr], #0x0]\n"
       "sadalp v2.8h, v17.16b\n"
       "str q16, [%x[out_ptr], #0x10]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
index 44a79c0..17eb7d5 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
@@ -153,202 +153,202 @@
       "5:"  // Main loop skip
       "cbz %x[width], 14f\n"
       "tbz %x[width], #3, 9f\n"
-      "ldr d30, [x28], #0x8\n"
-      "ldr d29, [x27], #0x8\n"
-      "ldr d28, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "ldr d20, [x24], #0x8\n"
-      "ldr d26, [x23], #0x8\n"
-      "ldr d19, [x22], #0x8\n"
-      "ldr d18, [x21], #0x8\n"
+      "ldr d29, [x28], #0x8\n"
+      "ldr d28, [x27], #0x8\n"
+      "ldr d27, [x26], #0x8\n"
+      "ldr d26, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d23, [x22], #0x8\n"
+      "ldr d22, [x21], #0x8\n"
       "tbz %x[width], #2, 7f\n"
-      "ld1 { v30.s }[2], [x28], #0x4\n"
-      "ld1 { v29.s }[2], [x27], #0x4\n"
-      "ld1 { v28.s }[2], [x26], #0x4\n"
-      "ld1 { v27.s }[2], [x25], #0x4\n"
-      "ld1 { v20.s }[2], [x24], #0x4\n"
-      "ld1 { v26.s }[2], [x23], #0x4\n"
-      "ld1 { v19.s }[2], [x22], #0x4\n"
-      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v29.s }[2], [x28], #0x4\n"
+      "ld1 { v28.s }[2], [x27], #0x4\n"
+      "ld1 { v27.s }[2], [x26], #0x4\n"
+      "ld1 { v26.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v23.s }[2], [x22], #0x4\n"
+      "ld1 { v22.s }[2], [x21], #0x4\n"
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v30.h }[6], [x28], #0x2\n"
-      "ld1 { v29.h }[6], [x27], #0x2\n"
+      "ld1 { v29.h }[6], [x28], #0x2\n"
+      "ld1 { v28.h }[6], [x27], #0x2\n"
       "mov x20, #0x4\n"
-      "ld1 { v28.h }[6], [x26], #0x2\n"
-      "ld1 { v27.h }[6], [x25], #0x2\n"
-      "ld1 { v20.h }[6], [x24], #0x2\n"
-      "ld1 { v26.h }[6], [x23], #0x2\n"
-      "ld1 { v19.h }[6], [x22], #0x2\n"
-      "ld1 { v18.h }[6], [x21], #0x2\n"
+      "ld1 { v27.h }[6], [x26], #0x2\n"
+      "ld1 { v26.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v24.h }[6], [x23], #0x2\n"
+      "ld1 { v23.h }[6], [x22], #0x2\n"
+      "ld1 { v22.h }[6], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[14], [x28]\n"
-      "ld1 { v29.b }[14], [x27]\n"
-      "ld1 { v28.b }[14], [x26]\n"
-      "ld1 { v27.b }[14], [x25]\n"
-      "ld1 { v20.b }[14], [x24]\n"
-      "ld1 { v26.b }[14], [x23]\n"
-      "ld1 { v19.b }[14], [x22]\n"
-      "ld1 { v18.b }[14], [x21]\n"
+      "ld1 { v29.b }[14], [x28]\n"
+      "ld1 { v28.b }[14], [x27]\n"
+      "ld1 { v27.b }[14], [x26]\n"
+      "ld1 { v26.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v24.b }[14], [x23]\n"
+      "ld1 { v23.b }[14], [x22]\n"
+      "ld1 { v22.b }[14], [x21]\n"
       "b 13f\n"
       "6:"  // odd_loads_1_12
       "mov x20, #0x3\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[12], [x28]\n"
-      "ld1 { v29.b }[12], [x27]\n"
+      "ld1 { v29.b }[12], [x28]\n"
+      "ld1 { v28.b }[12], [x27]\n"
       "mov x20, #0x4\n"
-      "ld1 { v28.b }[12], [x26]\n"
-      "ld1 { v27.b }[12], [x25]\n"
-      "ld1 { v20.b }[12], [x24]\n"
-      "ld1 { v26.b }[12], [x23]\n"
-      "ld1 { v19.b }[12], [x22]\n"
-      "ld1 { v18.b }[12], [x21]\n"
+      "ld1 { v27.b }[12], [x26]\n"
+      "ld1 { v26.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v24.b }[12], [x23]\n"
+      "ld1 { v23.b }[12], [x22]\n"
+      "ld1 { v22.b }[12], [x21]\n"
       "b 13f\n"
       "7:"  // odd_loads_2_8
       "tbz %x[width], #1, 8f\n"
-      "ld1 { v30.h }[4], [x28], #0x2\n"
-      "ld1 { v29.h }[4], [x27], #0x2\n"
+      "ld1 { v29.h }[4], [x28], #0x2\n"
+      "ld1 { v28.h }[4], [x27], #0x2\n"
       "mov x20, #0x3\n"
-      "ld1 { v28.h }[4], [x26], #0x2\n"
-      "ld1 { v27.h }[4], [x25], #0x2\n"
-      "ld1 { v20.h }[4], [x24], #0x2\n"
-      "ld1 { v26.h }[4], [x23], #0x2\n"
-      "ld1 { v19.h }[4], [x22], #0x2\n"
-      "ld1 { v18.h }[4], [x21], #0x2\n"
+      "ld1 { v27.h }[4], [x26], #0x2\n"
+      "ld1 { v26.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v24.h }[4], [x23], #0x2\n"
+      "ld1 { v23.h }[4], [x22], #0x2\n"
+      "ld1 { v22.h }[4], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[10], [x28]\n"
-      "ld1 { v29.b }[10], [x27]\n"
-      "ld1 { v28.b }[10], [x26]\n"
-      "ld1 { v27.b }[10], [x25]\n"
-      "ld1 { v20.b }[10], [x24]\n"
-      "ld1 { v26.b }[10], [x23]\n"
-      "ld1 { v19.b }[10], [x22]\n"
-      "ld1 { v18.b }[10], [x21]\n"
+      "ld1 { v29.b }[10], [x28]\n"
+      "ld1 { v28.b }[10], [x27]\n"
+      "ld1 { v27.b }[10], [x26]\n"
+      "ld1 { v26.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v24.b }[10], [x23]\n"
+      "ld1 { v23.b }[10], [x22]\n"
+      "ld1 { v22.b }[10], [x21]\n"
       "b 13f\n"
       "8:"  // odd_loads_1_8
       "mov x20, #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[8], [x28]\n"
-      "ld1 { v29.b }[8], [x27]\n"
+      "ld1 { v29.b }[8], [x28]\n"
+      "ld1 { v28.b }[8], [x27]\n"
       "mov x20, #0x3\n"
-      "ld1 { v28.b }[8], [x26]\n"
-      "ld1 { v27.b }[8], [x25]\n"
-      "ld1 { v20.b }[8], [x24]\n"
-      "ld1 { v26.b }[8], [x23]\n"
-      "ld1 { v19.b }[8], [x22]\n"
-      "ld1 { v18.b }[8], [x21]\n"
+      "ld1 { v27.b }[8], [x26]\n"
+      "ld1 { v26.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v24.b }[8], [x23]\n"
+      "ld1 { v23.b }[8], [x22]\n"
+      "ld1 { v22.b }[8], [x21]\n"
       "b 13f\n"
       "9:"  // odd_loads_4_0
       "tbz %x[width], #2, 11f\n"
-      "ldr s30, [x28], #0x4\n"
-      "ldr s29, [x27], #0x4\n"
-      "ldr s28, [x26], #0x4\n"
-      "ldr s27, [x25], #0x4\n"
-      "ldr s20, [x24], #0x4\n"
-      "ldr s26, [x23], #0x4\n"
-      "ldr s19, [x22], #0x4\n"
-      "ldr s18, [x21], #0x4\n"
+      "ldr s29, [x28], #0x4\n"
+      "ldr s28, [x27], #0x4\n"
+      "ldr s27, [x26], #0x4\n"
+      "ldr s26, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s23, [x22], #0x4\n"
+      "ldr s22, [x21], #0x4\n"
       "tbz %x[width], #1, 10f\n"
-      "ld1 { v30.h }[2], [x28], #0x2\n"
-      "ld1 { v29.h }[2], [x27], #0x2\n"
+      "ld1 { v29.h }[2], [x28], #0x2\n"
+      "ld1 { v28.h }[2], [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ld1 { v28.h }[2], [x26], #0x2\n"
-      "ld1 { v27.h }[2], [x25], #0x2\n"
-      "ld1 { v20.h }[2], [x24], #0x2\n"
-      "ld1 { v26.h }[2], [x23], #0x2\n"
-      "ld1 { v19.h }[2], [x22], #0x2\n"
-      "ld1 { v18.h }[2], [x21], #0x2\n"
+      "ld1 { v27.h }[2], [x26], #0x2\n"
+      "ld1 { v26.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v24.h }[2], [x23], #0x2\n"
+      "ld1 { v23.h }[2], [x22], #0x2\n"
+      "ld1 { v22.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[6], [x28]\n"
-      "ld1 { v29.b }[6], [x27]\n"
-      "ld1 { v28.b }[6], [x26]\n"
-      "ld1 { v27.b }[6], [x25]\n"
-      "ld1 { v20.b }[6], [x24]\n"
-      "ld1 { v26.b }[6], [x23]\n"
-      "ld1 { v19.b }[6], [x22]\n"
-      "ld1 { v18.b }[6], [x21]\n"
+      "ld1 { v29.b }[6], [x28]\n"
+      "ld1 { v28.b }[6], [x27]\n"
+      "ld1 { v27.b }[6], [x26]\n"
+      "ld1 { v26.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v24.b }[6], [x23]\n"
+      "ld1 { v23.b }[6], [x22]\n"
+      "ld1 { v22.b }[6], [x21]\n"
       "b 13f\n"
       "10:"  // odd_loads_1_4
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[4], [x28]\n"
-      "ld1 { v29.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x28]\n"
+      "ld1 { v28.b }[4], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v28.b }[4], [x26]\n"
-      "ld1 { v27.b }[4], [x25]\n"
-      "ld1 { v20.b }[4], [x24]\n"
-      "ld1 { v26.b }[4], [x23]\n"
-      "ld1 { v19.b }[4], [x22]\n"
-      "ld1 { v18.b }[4], [x21]\n"
+      "ld1 { v27.b }[4], [x26]\n"
+      "ld1 { v26.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v24.b }[4], [x23]\n"
+      "ld1 { v23.b }[4], [x22]\n"
+      "ld1 { v22.b }[4], [x21]\n"
       "b 13f\n"
       "11:"  // odd_loads_2_0
       "tbz %x[width], #1, 12f\n"
-      "ldr h30, [x28], #0x2\n"
-      "ldr h29, [x27], #0x2\n"
+      "ldr h29, [x28], #0x2\n"
+      "ldr h28, [x27], #0x2\n"
       "mov x20, #0x1\n"
-      "ldr h28, [x26], #0x2\n"
-      "ldr h27, [x25], #0x2\n"
-      "ldr h20, [x24], #0x2\n"
-      "ldr h26, [x23], #0x2\n"
-      "ldr h19, [x22], #0x2\n"
-      "ldr h18, [x21], #0x2\n"
+      "ldr h27, [x26], #0x2\n"
+      "ldr h26, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h24, [x23], #0x2\n"
+      "ldr h23, [x22], #0x2\n"
+      "ldr h22, [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
-      "ld1 { v30.b }[2], [x28]\n"
-      "ld1 { v29.b }[2], [x27]\n"
-      "ld1 { v28.b }[2], [x26]\n"
-      "ld1 { v27.b }[2], [x25]\n"
-      "ld1 { v20.b }[2], [x24]\n"
-      "ld1 { v26.b }[2], [x23]\n"
-      "ld1 { v19.b }[2], [x22]\n"
-      "ld1 { v18.b }[2], [x21]\n"
+      "ld1 { v29.b }[2], [x28]\n"
+      "ld1 { v28.b }[2], [x27]\n"
+      "ld1 { v27.b }[2], [x26]\n"
+      "ld1 { v26.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v24.b }[2], [x23]\n"
+      "ld1 { v23.b }[2], [x22]\n"
+      "ld1 { v22.b }[2], [x21]\n"
       "b 13f\n"
       "12:"  // odd_loads_1_0
-      "ldr b30, [x28, #0x0]\n"
-      "ldr b29, [x27, #0x0]\n"
+      "ldr b29, [x28, #0x0]\n"
+      "ldr b28, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b28, [x26, #0x0]\n"
-      "ldr b27, [x25, #0x0]\n"
-      "ldr b20, [x24, #0x0]\n"
-      "ldr b26, [x23, #0x0]\n"
-      "ldr b19, [x22, #0x0]\n"
-      "ldr b18, [x21, #0x0]\n"
+      "ldr b27, [x26, #0x0]\n"
+      "ldr b26, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b24, [x23, #0x0]\n"
+      "ldr b23, [x22, #0x0]\n"
+      "ldr b22, [x21, #0x0]\n"
       "13:"  // Odd load end
-      "zip1 v22.4s, v30.4s, v28.4s\n"
       "zip1 v21.4s, v29.4s, v27.4s\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip1 v17.4s, v20.4s, v19.4s\n"
-      "zip1 v16.4s, v26.4s, v18.4s\n"
-      "zip1 v25.4s, v22.4s, v21.4s\n"
-      "zip1 v24.4s, v17.4s, v16.4s\n"
-      "str q25, [%x[out_ptr], #0x0]\n"
-      "uadalp v2.8h, v25.16b\n"
-      "str q24, [%x[out_ptr], #0x10]\n"
-      "uadalp v1.8h, v24.16b\n"
+      "zip1 v19.4s, v25.4s, v23.4s\n"
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v21.4s, v20.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "uadalp v2.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v1.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 14f\n"
-      "zip2 v23.4s, v22.4s, v21.4s\n"
-      "zip2 v22.4s, v17.4s, v16.4s\n"
+      "zip2 v17.4s, v21.4s, v20.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
       "subs x20, x20, #0x1\n"
-      "str q23, [%x[out_ptr], #0x0]\n"
-      "uadalp v2.8h, v23.16b\n"
-      "str q22, [%x[out_ptr], #0x10]\n"
-      "uadalp v1.8h, v22.16b\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "uadalp v2.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v1.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 14f\n"
-      "zip2 v21.4s, v30.4s, v28.4s\n"
-      "zip2 v17.4s, v29.4s, v27.4s\n"
+      "zip2 v21.4s, v29.4s, v27.4s\n"
+      "zip2 v20.4s, v28.4s, v26.4s\n"
       "subs x20, x20, #0x1\n"
-      "zip2 v20.4s, v20.4s, v19.4s\n"
-      "zip2 v16.4s, v26.4s, v18.4s\n"
-      "zip1 v19.4s, v21.4s, v17.4s\n"
-      "zip1 v18.4s, v20.4s, v16.4s\n"
-      "str q19, [%x[out_ptr], #0x0]\n"
-      "uadalp v2.8h, v19.16b\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "uadalp v1.8h, v18.16b\n"
+      "zip2 v19.4s, v25.4s, v23.4s\n"
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v21.4s, v20.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "uadalp v2.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v1.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       "beq 14f\n"
-      "zip2 v17.4s, v21.4s, v17.4s\n"
-      "zip2 v16.4s, v20.4s, v16.4s\n"
+      "zip2 v17.4s, v21.4s, v20.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
       "str q17, [%x[out_ptr], #0x0]\n"
       "uadalp v2.8h, v17.16b\n"
       "str q16, [%x[out_ptr], #0x10]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
index 4bfb360..7b445ef 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -79,18 +79,18 @@
       "prfm pldl1keep, [x21, #0x40]\n"
       "blt 3f\n"
       "2:"  // Main loop head
-      "ldr q26, [x28], #0x10\n"
-      "ldr q21, [x27], #0x10\n"
+      "ldr q20, [x28], #0x10\n"
+      "ldr q19, [x27], #0x10\n"
       "subs %x[width], %x[width], #0x10\n"
       "cmp %x[width], #0x10\n"
       "ldr q25, [x26], #0x10\n"
       "ldr q24, [x25], #0x10\n"
-      "zip1 v16.2d, v26.2d, v21.2d\n"
+      "zip1 v16.2d, v20.2d, v19.2d\n"
       "zip1 v18.2d, v25.2d, v24.2d\n"
       "ldr q23, [x24], #0x10\n"
       "ldr q22, [x23], #0x10\n"
       "zip1 v17.2d, v23.2d, v22.2d\n"
-      "zip2 v21.2d, v26.2d, v21.2d\n"
+      "zip2 v21.2d, v20.2d, v19.2d\n"
       "ldr q20, [x22], #0x10\n"
       "ldr q19, [x21], #0x10\n"
       "str q16, [%x[out_ptr], #0x0]\n"
@@ -118,188 +118,187 @@
       "3:"  // Main loop skip
       "cbz %x[width], 12f\n"
       "tbz %x[width], #3, 7f\n"
-      "ldr d26, [x28], #0x8\n"
-      "ldr d21, [x27], #0x8\n"
-      "ldr d25, [x26], #0x8\n"
-      "ldr d24, [x25], #0x8\n"
-      "ldr d23, [x24], #0x8\n"
-      "ldr d22, [x23], #0x8\n"
-      "ldr d20, [x22], #0x8\n"
-      "ldr d19, [x21], #0x8\n"
+      "ldr d25, [x28], #0x8\n"
+      "ldr d24, [x27], #0x8\n"
+      "ldr d23, [x26], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d20, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d18, [x21], #0x8\n"
       "tbz %x[width], #2, 5f\n"
-      "ld1 { v26.s }[2], [x28], #0x4\n"
-      "ld1 { v21.s }[2], [x27], #0x4\n"
-      "ld1 { v25.s }[2], [x26], #0x4\n"
-      "ld1 { v24.s }[2], [x25], #0x4\n"
-      "ld1 { v23.s }[2], [x24], #0x4\n"
-      "ld1 { v22.s }[2], [x23], #0x4\n"
-      "ld1 { v20.s }[2], [x22], #0x4\n"
-      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v25.s }[2], [x28], #0x4\n"
+      "ld1 { v24.s }[2], [x27], #0x4\n"
+      "ld1 { v23.s }[2], [x26], #0x4\n"
+      "ld1 { v22.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v20.s }[2], [x23], #0x4\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "ld1 { v18.s }[2], [x21], #0x4\n"
       "tbz %x[width], #1, 4f\n"
-      "ld1 { v26.h }[6], [x28], #0x2\n"
-      "ld1 { v21.h }[6], [x27], #0x2\n"
+      "ld1 { v25.h }[6], [x28], #0x2\n"
+      "ld1 { v24.h }[6], [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ld1 { v25.h }[6], [x26], #0x2\n"
-      "ld1 { v24.h }[6], [x25], #0x2\n"
-      "ld1 { v23.h }[6], [x24], #0x2\n"
-      "ld1 { v22.h }[6], [x23], #0x2\n"
-      "ld1 { v20.h }[6], [x22], #0x2\n"
-      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v23.h }[6], [x26], #0x2\n"
+      "ld1 { v22.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v20.h }[6], [x23], #0x2\n"
+      "ld1 { v19.h }[6], [x22], #0x2\n"
+      "ld1 { v18.h }[6], [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[14], [x28]\n"
-      "ld1 { v21.b }[14], [x27]\n"
-      "ld1 { v25.b }[14], [x26]\n"
-      "ld1 { v24.b }[14], [x25]\n"
-      "ld1 { v23.b }[14], [x24]\n"
-      "ld1 { v22.b }[14], [x23]\n"
-      "ld1 { v20.b }[14], [x22]\n"
-      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v25.b }[14], [x28]\n"
+      "ld1 { v24.b }[14], [x27]\n"
+      "ld1 { v23.b }[14], [x26]\n"
+      "ld1 { v22.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v20.b }[14], [x23]\n"
+      "ld1 { v19.b }[14], [x22]\n"
+      "ld1 { v18.b }[14], [x21]\n"
       "b 11f\n"
       "4:"  // odd_loads_1_12
       "mov x20, #0x2\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[12], [x28]\n"
-      "ld1 { v21.b }[12], [x27]\n"
-      "ld1 { v25.b }[12], [x26]\n"
-      "ld1 { v24.b }[12], [x25]\n"
-      "ld1 { v23.b }[12], [x24]\n"
-      "ld1 { v22.b }[12], [x23]\n"
-      "ld1 { v20.b }[12], [x22]\n"
-      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v25.b }[12], [x28]\n"
+      "ld1 { v24.b }[12], [x27]\n"
+      "ld1 { v23.b }[12], [x26]\n"
+      "ld1 { v22.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v20.b }[12], [x23]\n"
+      "ld1 { v19.b }[12], [x22]\n"
+      "ld1 { v18.b }[12], [x21]\n"
       "b 11f\n"
       "5:"  // odd_loads_2_8
       "tbz %x[width], #1, 6f\n"
-      "ld1 { v26.h }[4], [x28], #0x2\n"
-      "ld1 { v21.h }[4], [x27], #0x2\n"
+      "ld1 { v25.h }[4], [x28], #0x2\n"
+      "ld1 { v24.h }[4], [x27], #0x2\n"
       "mov x20, #0x2\n"
-      "ld1 { v25.h }[4], [x26], #0x2\n"
-      "ld1 { v24.h }[4], [x25], #0x2\n"
-      "ld1 { v23.h }[4], [x24], #0x2\n"
-      "ld1 { v22.h }[4], [x23], #0x2\n"
-      "ld1 { v20.h }[4], [x22], #0x2\n"
-      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v23.h }[4], [x26], #0x2\n"
+      "ld1 { v22.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v20.h }[4], [x23], #0x2\n"
+      "ld1 { v19.h }[4], [x22], #0x2\n"
+      "ld1 { v18.h }[4], [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[10], [x28]\n"
-      "ld1 { v21.b }[10], [x27]\n"
-      "ld1 { v25.b }[10], [x26]\n"
-      "ld1 { v24.b }[10], [x25]\n"
-      "ld1 { v23.b }[10], [x24]\n"
-      "ld1 { v22.b }[10], [x23]\n"
-      "ld1 { v20.b }[10], [x22]\n"
-      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v25.b }[10], [x28]\n"
+      "ld1 { v24.b }[10], [x27]\n"
+      "ld1 { v23.b }[10], [x26]\n"
+      "ld1 { v22.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v20.b }[10], [x23]\n"
+      "ld1 { v19.b }[10], [x22]\n"
+      "ld1 { v18.b }[10], [x21]\n"
       "b 11f\n"
       "6:"  // odd_loads_1_8
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[8], [x28]\n"
-      "ld1 { v21.b }[8], [x27]\n"
+      "ld1 { v25.b }[8], [x28]\n"
+      "ld1 { v24.b }[8], [x27]\n"
       "mov x20, #0x2\n"
-      "ld1 { v25.b }[8], [x26]\n"
-      "ld1 { v24.b }[8], [x25]\n"
-      "ld1 { v23.b }[8], [x24]\n"
-      "ld1 { v22.b }[8], [x23]\n"
-      "ld1 { v20.b }[8], [x22]\n"
-      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v23.b }[8], [x26]\n"
+      "ld1 { v22.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v20.b }[8], [x23]\n"
+      "ld1 { v19.b }[8], [x22]\n"
+      "ld1 { v18.b }[8], [x21]\n"
       "b 11f\n"
       "7:"  // odd_loads_4_0
       "tbz %x[width], #2, 9f\n"
-      "ldr s26, [x28], #0x4\n"
-      "ldr s21, [x27], #0x4\n"
-      "ldr s25, [x26], #0x4\n"
-      "ldr s24, [x25], #0x4\n"
-      "ldr s23, [x24], #0x4\n"
-      "ldr s22, [x23], #0x4\n"
-      "ldr s20, [x22], #0x4\n"
-      "ldr s19, [x21], #0x4\n"
+      "ldr s25, [x28], #0x4\n"
+      "ldr s24, [x27], #0x4\n"
+      "ldr s23, [x26], #0x4\n"
+      "ldr s22, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s20, [x23], #0x4\n"
+      "ldr s19, [x22], #0x4\n"
+      "ldr s18, [x21], #0x4\n"
       "tbz %x[width], #1, 8f\n"
-      "ld1 { v26.h }[2], [x28], #0x2\n"
-      "ld1 { v21.h }[2], [x27], #0x2\n"
+      "ld1 { v25.h }[2], [x28], #0x2\n"
+      "ld1 { v24.h }[2], [x27], #0x2\n"
       "mov x20, #0x1\n"
-      "ld1 { v25.h }[2], [x26], #0x2\n"
-      "ld1 { v24.h }[2], [x25], #0x2\n"
-      "ld1 { v23.h }[2], [x24], #0x2\n"
-      "ld1 { v22.h }[2], [x23], #0x2\n"
-      "ld1 { v20.h }[2], [x22], #0x2\n"
-      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v23.h }[2], [x26], #0x2\n"
+      "ld1 { v22.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v20.h }[2], [x23], #0x2\n"
+      "ld1 { v19.h }[2], [x22], #0x2\n"
+      "ld1 { v18.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[6], [x28]\n"
-      "ld1 { v21.b }[6], [x27]\n"
-      "ld1 { v25.b }[6], [x26]\n"
-      "ld1 { v24.b }[6], [x25]\n"
-      "ld1 { v23.b }[6], [x24]\n"
-      "ld1 { v22.b }[6], [x23]\n"
-      "ld1 { v20.b }[6], [x22]\n"
-      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x28]\n"
+      "ld1 { v24.b }[6], [x27]\n"
+      "ld1 { v23.b }[6], [x26]\n"
+      "ld1 { v22.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v20.b }[6], [x23]\n"
+      "ld1 { v19.b }[6], [x22]\n"
+      "ld1 { v18.b }[6], [x21]\n"
       "b 11f\n"
       "8:"  // odd_loads_1_4
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[4], [x28]\n"
-      "ld1 { v21.b }[4], [x27]\n"
-      "ld1 { v25.b }[4], [x26]\n"
-      "ld1 { v24.b }[4], [x25]\n"
-      "ld1 { v23.b }[4], [x24]\n"
-      "ld1 { v22.b }[4], [x23]\n"
-      "ld1 { v20.b }[4], [x22]\n"
-      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x28]\n"
+      "ld1 { v24.b }[4], [x27]\n"
+      "ld1 { v23.b }[4], [x26]\n"
+      "ld1 { v22.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v20.b }[4], [x23]\n"
+      "ld1 { v19.b }[4], [x22]\n"
+      "ld1 { v18.b }[4], [x21]\n"
       "b 11f\n"
       "9:"  // odd_loads_2_0
       "tbz %x[width], #1, 10f\n"
-      "ldr h26, [x28], #0x2\n"
-      "ldr h21, [x27], #0x2\n"
+      "ldr h25, [x28], #0x2\n"
+      "ldr h24, [x27], #0x2\n"
       "mov x20, #0x1\n"
-      "ldr h25, [x26], #0x2\n"
-      "ldr h24, [x25], #0x2\n"
-      "ldr h23, [x24], #0x2\n"
-      "ldr h22, [x23], #0x2\n"
-      "ldr h20, [x22], #0x2\n"
-      "ldr h19, [x21], #0x2\n"
+      "ldr h23, [x26], #0x2\n"
+      "ldr h22, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h20, [x23], #0x2\n"
+      "ldr h19, [x22], #0x2\n"
+      "ldr h18, [x21], #0x2\n"
       "tbz %x[width], #0, 11f\n"
-      "ld1 { v26.b }[2], [x28]\n"
-      "ld1 { v21.b }[2], [x27]\n"
-      "ld1 { v25.b }[2], [x26]\n"
-      "ld1 { v24.b }[2], [x25]\n"
-      "ld1 { v23.b }[2], [x24]\n"
-      "ld1 { v22.b }[2], [x23]\n"
-      "ld1 { v20.b }[2], [x22]\n"
-      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x28]\n"
+      "ld1 { v24.b }[2], [x27]\n"
+      "ld1 { v23.b }[2], [x26]\n"
+      "ld1 { v22.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v20.b }[2], [x23]\n"
+      "ld1 { v19.b }[2], [x22]\n"
+      "ld1 { v18.b }[2], [x21]\n"
       "b 11f\n"
       "10:"  // odd_loads_1_0
-      "ldr b26, [x28, #0x0]\n"
-      "ldr b21, [x27, #0x0]\n"
+      "ldr b25, [x28, #0x0]\n"
+      "ldr b24, [x27, #0x0]\n"
       "mov x20, #0x1\n"
-      "ldr b25, [x26, #0x0]\n"
-      "ldr b24, [x25, #0x0]\n"
-      "ldr b23, [x24, #0x0]\n"
-      "ldr b22, [x23, #0x0]\n"
-      "ldr b20, [x22, #0x0]\n"
-      "ldr b19, [x21, #0x0]\n"
+      "ldr b23, [x26, #0x0]\n"
+      "ldr b22, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b20, [x23, #0x0]\n"
+      "ldr b19, [x22, #0x0]\n"
+      "ldr b18, [x21, #0x0]\n"
       "11:"  // Odd load end
       "subs x20, x20, #0x1\n"
-      "zip1 v16.2d, v26.2d, v21.2d\n"
+      "zip1 v16.2d, v25.2d, v24.2d\n"
       "str q16, [%x[out_ptr], #0x0]\n"
-      "zip1 v18.2d, v25.2d, v24.2d\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "zip1 v17.2d, v23.2d, v22.2d\n"
-      "zip1 v16.2d, v20.2d, v19.2d\n"
+      "zip1 v16.2d, v23.2d, v22.2d\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v21.2d, v20.2d\n"
+      "zip1 v16.2d, v19.2d, v18.2d\n"
       "str q17, [%x[out_ptr], #0x20]\n"
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "beq 12f\n"
-      "zip2 v21.2d, v26.2d, v21.2d\n"
-      "str q21, [%x[out_ptr], #0x0]\n"
-      "zip2 v18.2d, v25.2d, v24.2d\n"
-      "str q18, [%x[out_ptr], #0x10]\n"
-      "zip2 v17.2d, v23.2d, v22.2d\n"
-      "zip2 v16.2d, v20.2d, v19.2d\n"
+      "zip2 v16.2d, v25.2d, v24.2d\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.2d, v23.2d, v22.2d\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v17.2d, v21.2d, v20.2d\n"
+      "zip2 v16.2d, v19.2d, v18.2d\n"
       "str q17, [%x[out_ptr], #0x20]\n"
       "str q16, [%x[out_ptr], #0x30]\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "12:"  // Odds skip
-
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
-      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
 }
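With the rewritten tail above no longer using v26, that register is also dropped from the asm clobber list. A stand-alone AArch64 illustration of the convention being preserved here, namely that every vector register the statement writes appears in its clobber list (the function name and the statement body are made up for this example):

#include <cstdint>

static void clobber_example(int32_t *out)
{
    __asm__ __volatile__(
        "movi v16.4s, #1\n"               // writes v16, so v16 must be clobbered
        "str q16, [%x[out]]\n"
        : [out] "+&r" (out)
        :
        : "cc", "memory", "v16"
    );
}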
 
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
index c6ad294..a2288e8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -156,182 +156,182 @@
       "cbz %x[width], 14f\n"
       "tbz %x[width], #3, 9f\n"
       "ldr d27, [x28], #0x8\n"
-      "ldr d19, [x27], #0x8\n"
+      "ldr d26, [x27], #0x8\n"
       "ldr d25, [x26], #0x8\n"
-      "ldr d18, [x25], #0x8\n"
+      "ldr d24, [x25], #0x8\n"
       "ldr d23, [x24], #0x8\n"
-      "ldr d17, [x23], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
       "ldr d21, [x22], #0x8\n"
-      "ldr d16, [x21], #0x8\n"
+      "ldr d20, [x21], #0x8\n"
       "tbz %x[width], #2, 7f\n"
       "ld1 { v27.s }[2], [x28], #0x4\n"
-      "ld1 { v19.s }[2], [x27], #0x4\n"
+      "ld1 { v26.s }[2], [x27], #0x4\n"
       "ld1 { v25.s }[2], [x26], #0x4\n"
-      "ld1 { v18.s }[2], [x25], #0x4\n"
+      "ld1 { v24.s }[2], [x25], #0x4\n"
       "ld1 { v23.s }[2], [x24], #0x4\n"
-      "ld1 { v17.s }[2], [x23], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
       "ld1 { v21.s }[2], [x22], #0x4\n"
-      "ld1 { v16.s }[2], [x21], #0x4\n"
+      "ld1 { v20.s }[2], [x21], #0x4\n"
       "tbz %x[width], #1, 6f\n"
       "ld1 { v27.h }[6], [x28], #0x2\n"
-      "ld1 { v19.h }[6], [x27], #0x2\n"
+      "ld1 { v26.h }[6], [x27], #0x2\n"
       "mov x20, #0x2\n"
       "ld1 { v25.h }[6], [x26], #0x2\n"
-      "ld1 { v18.h }[6], [x25], #0x2\n"
+      "ld1 { v24.h }[6], [x25], #0x2\n"
       "ld1 { v23.h }[6], [x24], #0x2\n"
-      "ld1 { v17.h }[6], [x23], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
       "ld1 { v21.h }[6], [x22], #0x2\n"
-      "ld1 { v16.h }[6], [x21], #0x2\n"
+      "ld1 { v20.h }[6], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[14], [x28]\n"
-      "ld1 { v19.b }[14], [x27]\n"
+      "ld1 { v26.b }[14], [x27]\n"
       "ld1 { v25.b }[14], [x26]\n"
-      "ld1 { v18.b }[14], [x25]\n"
+      "ld1 { v24.b }[14], [x25]\n"
       "ld1 { v23.b }[14], [x24]\n"
-      "ld1 { v17.b }[14], [x23]\n"
+      "ld1 { v22.b }[14], [x23]\n"
       "ld1 { v21.b }[14], [x22]\n"
-      "ld1 { v16.b }[14], [x21]\n"
+      "ld1 { v20.b }[14], [x21]\n"
       "b 13f\n"
       "6:"  // odd_loads_1_12
       "mov x20, #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[12], [x28]\n"
-      "ld1 { v19.b }[12], [x27]\n"
+      "ld1 { v26.b }[12], [x27]\n"
       "ld1 { v25.b }[12], [x26]\n"
-      "ld1 { v18.b }[12], [x25]\n"
+      "ld1 { v24.b }[12], [x25]\n"
       "ld1 { v23.b }[12], [x24]\n"
-      "ld1 { v17.b }[12], [x23]\n"
+      "ld1 { v22.b }[12], [x23]\n"
       "ld1 { v21.b }[12], [x22]\n"
-      "ld1 { v16.b }[12], [x21]\n"
+      "ld1 { v20.b }[12], [x21]\n"
       "b 13f\n"
       "7:"  // odd_loads_2_8
       "tbz %x[width], #1, 8f\n"
       "ld1 { v27.h }[4], [x28], #0x2\n"
-      "ld1 { v19.h }[4], [x27], #0x2\n"
+      "ld1 { v26.h }[4], [x27], #0x2\n"
       "mov x20, #0x2\n"
       "ld1 { v25.h }[4], [x26], #0x2\n"
-      "ld1 { v18.h }[4], [x25], #0x2\n"
+      "ld1 { v24.h }[4], [x25], #0x2\n"
       "ld1 { v23.h }[4], [x24], #0x2\n"
-      "ld1 { v17.h }[4], [x23], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
       "ld1 { v21.h }[4], [x22], #0x2\n"
-      "ld1 { v16.h }[4], [x21], #0x2\n"
+      "ld1 { v20.h }[4], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[10], [x28]\n"
-      "ld1 { v19.b }[10], [x27]\n"
+      "ld1 { v26.b }[10], [x27]\n"
       "ld1 { v25.b }[10], [x26]\n"
-      "ld1 { v18.b }[10], [x25]\n"
+      "ld1 { v24.b }[10], [x25]\n"
       "ld1 { v23.b }[10], [x24]\n"
-      "ld1 { v17.b }[10], [x23]\n"
+      "ld1 { v22.b }[10], [x23]\n"
       "ld1 { v21.b }[10], [x22]\n"
-      "ld1 { v16.b }[10], [x21]\n"
+      "ld1 { v20.b }[10], [x21]\n"
       "b 13f\n"
       "8:"  // odd_loads_1_8
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[8], [x28]\n"
-      "ld1 { v19.b }[8], [x27]\n"
+      "ld1 { v26.b }[8], [x27]\n"
       "mov x20, #0x2\n"
       "ld1 { v25.b }[8], [x26]\n"
-      "ld1 { v18.b }[8], [x25]\n"
+      "ld1 { v24.b }[8], [x25]\n"
       "ld1 { v23.b }[8], [x24]\n"
-      "ld1 { v17.b }[8], [x23]\n"
+      "ld1 { v22.b }[8], [x23]\n"
       "ld1 { v21.b }[8], [x22]\n"
-      "ld1 { v16.b }[8], [x21]\n"
+      "ld1 { v20.b }[8], [x21]\n"
       "b 13f\n"
       "9:"  // odd_loads_4_0
       "tbz %x[width], #2, 11f\n"
       "ldr s27, [x28], #0x4\n"
-      "ldr s19, [x27], #0x4\n"
+      "ldr s26, [x27], #0x4\n"
       "ldr s25, [x26], #0x4\n"
-      "ldr s18, [x25], #0x4\n"
+      "ldr s24, [x25], #0x4\n"
       "ldr s23, [x24], #0x4\n"
-      "ldr s17, [x23], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
       "ldr s21, [x22], #0x4\n"
-      "ldr s16, [x21], #0x4\n"
+      "ldr s20, [x21], #0x4\n"
       "tbz %x[width], #1, 10f\n"
       "ld1 { v27.h }[2], [x28], #0x2\n"
-      "ld1 { v19.h }[2], [x27], #0x2\n"
+      "ld1 { v26.h }[2], [x27], #0x2\n"
       "mov x20, #0x1\n"
       "ld1 { v25.h }[2], [x26], #0x2\n"
-      "ld1 { v18.h }[2], [x25], #0x2\n"
+      "ld1 { v24.h }[2], [x25], #0x2\n"
       "ld1 { v23.h }[2], [x24], #0x2\n"
-      "ld1 { v17.h }[2], [x23], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
       "ld1 { v21.h }[2], [x22], #0x2\n"
-      "ld1 { v16.h }[2], [x21], #0x2\n"
+      "ld1 { v20.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[6], [x28]\n"
-      "ld1 { v19.b }[6], [x27]\n"
+      "ld1 { v26.b }[6], [x27]\n"
       "ld1 { v25.b }[6], [x26]\n"
-      "ld1 { v18.b }[6], [x25]\n"
+      "ld1 { v24.b }[6], [x25]\n"
       "ld1 { v23.b }[6], [x24]\n"
-      "ld1 { v17.b }[6], [x23]\n"
+      "ld1 { v22.b }[6], [x23]\n"
       "ld1 { v21.b }[6], [x22]\n"
-      "ld1 { v16.b }[6], [x21]\n"
+      "ld1 { v20.b }[6], [x21]\n"
       "b 13f\n"
       "10:"  // odd_loads_1_4
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[4], [x28]\n"
-      "ld1 { v19.b }[4], [x27]\n"
+      "ld1 { v26.b }[4], [x27]\n"
       "ld1 { v25.b }[4], [x26]\n"
-      "ld1 { v18.b }[4], [x25]\n"
+      "ld1 { v24.b }[4], [x25]\n"
       "ld1 { v23.b }[4], [x24]\n"
-      "ld1 { v17.b }[4], [x23]\n"
+      "ld1 { v22.b }[4], [x23]\n"
       "ld1 { v21.b }[4], [x22]\n"
-      "ld1 { v16.b }[4], [x21]\n"
+      "ld1 { v20.b }[4], [x21]\n"
       "b 13f\n"
       "11:"  // odd_loads_2_0
       "tbz %x[width], #1, 12f\n"
       "ldr h27, [x28], #0x2\n"
-      "ldr h19, [x27], #0x2\n"
+      "ldr h26, [x27], #0x2\n"
       "mov x20, #0x1\n"
       "ldr h25, [x26], #0x2\n"
-      "ldr h18, [x25], #0x2\n"
+      "ldr h24, [x25], #0x2\n"
       "ldr h23, [x24], #0x2\n"
-      "ldr h17, [x23], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
       "ldr h21, [x22], #0x2\n"
-      "ldr h16, [x21], #0x2\n"
+      "ldr h20, [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[2], [x28]\n"
-      "ld1 { v19.b }[2], [x27]\n"
+      "ld1 { v26.b }[2], [x27]\n"
       "ld1 { v25.b }[2], [x26]\n"
-      "ld1 { v18.b }[2], [x25]\n"
+      "ld1 { v24.b }[2], [x25]\n"
       "ld1 { v23.b }[2], [x24]\n"
-      "ld1 { v17.b }[2], [x23]\n"
+      "ld1 { v22.b }[2], [x23]\n"
       "ld1 { v21.b }[2], [x22]\n"
-      "ld1 { v16.b }[2], [x21]\n"
+      "ld1 { v20.b }[2], [x21]\n"
       "b 13f\n"
       "12:"  // odd_loads_1_0
       "ldr b27, [x28, #0x0]\n"
-      "ldr b19, [x27, #0x0]\n"
+      "ldr b26, [x27, #0x0]\n"
       "mov x20, #0x1\n"
       "ldr b25, [x26, #0x0]\n"
-      "ldr b18, [x25, #0x0]\n"
+      "ldr b24, [x25, #0x0]\n"
       "ldr b23, [x24, #0x0]\n"
-      "ldr b17, [x23, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
       "ldr b21, [x22, #0x0]\n"
-      "ldr b16, [x21, #0x0]\n"
+      "ldr b20, [x21, #0x0]\n"
       "13:"  // Odd load end
-      "zip1 v26.2d, v27.2d, v19.2d\n"
-      "zip1 v24.2d, v25.2d, v18.2d\n"
+      "zip1 v19.2d, v27.2d, v26.2d\n"
+      "zip1 v18.2d, v25.2d, v24.2d\n"
       "subs x20, x20, #0x1\n"
-      "str q26, [%x[out_ptr], #0x0]\n"
-      "zip1 v22.2d, v23.2d, v17.2d\n"
-      "zip1 v20.2d, v21.2d, v16.2d\n"
-      "str q24, [%x[out_ptr], #0x10]\n"
-      "sadalp v5.8h, v26.16b\n"
-      "sadalp v4.8h, v24.16b\n"
-      "str q22, [%x[out_ptr], #0x20]\n"
-      "sadalp v3.8h, v22.16b\n"
-      "str q20, [%x[out_ptr], #0x30]\n"
-      "sadalp v2.8h, v20.16b\n"
+      "str q19, [%x[out_ptr], #0x0]\n"
+      "zip1 v17.2d, v23.2d, v22.2d\n"
+      "zip1 v16.2d, v21.2d, v20.2d\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "sadalp v5.8h, v19.16b\n"
+      "sadalp v4.8h, v18.16b\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "beq 14f\n"
-      "zip2 v19.2d, v27.2d, v19.2d\n"
-      "zip2 v18.2d, v25.2d, v18.2d\n"
+      "zip2 v19.2d, v27.2d, v26.2d\n"
+      "zip2 v18.2d, v25.2d, v24.2d\n"
       "str q19, [%x[out_ptr], #0x0]\n"
-      "zip2 v17.2d, v23.2d, v17.2d\n"
-      "zip2 v16.2d, v21.2d, v16.2d\n"
+      "zip2 v17.2d, v23.2d, v22.2d\n"
+      "zip2 v16.2d, v21.2d, v20.2d\n"
       "str q18, [%x[out_ptr], #0x10]\n"
       "sadalp v5.8h, v19.16b\n"
       "sadalp v4.8h, v18.16b\n"
@@ -346,11 +346,11 @@
       "sadalp v31.4s, v3.8h\n"
       "sadalp v30.4s, v2.8h\n"
       "addp v1.4s, v1.4s, v0.4s\n"
-      "addp v0.4s, v31.4s, v30.4s\n"
+      "addp v16.4s, v31.4s, v30.4s\n"
       "add v1.4s, v1.4s, v29.4s\n"
-      "add v0.4s, v0.4s, v28.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
       "str q1, [%x[out_ptr], #0x0]\n"
-      "str q0, [%x[out_ptr], #0x10]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
index 6c4a5fa..56d34a8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -156,182 +156,182 @@
       "cbz %x[width], 14f\n"
       "tbz %x[width], #3, 9f\n"
       "ldr d27, [x28], #0x8\n"
-      "ldr d19, [x27], #0x8\n"
+      "ldr d26, [x27], #0x8\n"
       "ldr d25, [x26], #0x8\n"
-      "ldr d18, [x25], #0x8\n"
+      "ldr d24, [x25], #0x8\n"
       "ldr d23, [x24], #0x8\n"
-      "ldr d17, [x23], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
       "ldr d21, [x22], #0x8\n"
-      "ldr d16, [x21], #0x8\n"
+      "ldr d20, [x21], #0x8\n"
       "tbz %x[width], #2, 7f\n"
       "ld1 { v27.s }[2], [x28], #0x4\n"
-      "ld1 { v19.s }[2], [x27], #0x4\n"
+      "ld1 { v26.s }[2], [x27], #0x4\n"
       "ld1 { v25.s }[2], [x26], #0x4\n"
-      "ld1 { v18.s }[2], [x25], #0x4\n"
+      "ld1 { v24.s }[2], [x25], #0x4\n"
       "ld1 { v23.s }[2], [x24], #0x4\n"
-      "ld1 { v17.s }[2], [x23], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
       "ld1 { v21.s }[2], [x22], #0x4\n"
-      "ld1 { v16.s }[2], [x21], #0x4\n"
+      "ld1 { v20.s }[2], [x21], #0x4\n"
       "tbz %x[width], #1, 6f\n"
       "ld1 { v27.h }[6], [x28], #0x2\n"
-      "ld1 { v19.h }[6], [x27], #0x2\n"
+      "ld1 { v26.h }[6], [x27], #0x2\n"
       "mov x20, #0x2\n"
       "ld1 { v25.h }[6], [x26], #0x2\n"
-      "ld1 { v18.h }[6], [x25], #0x2\n"
+      "ld1 { v24.h }[6], [x25], #0x2\n"
       "ld1 { v23.h }[6], [x24], #0x2\n"
-      "ld1 { v17.h }[6], [x23], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
       "ld1 { v21.h }[6], [x22], #0x2\n"
-      "ld1 { v16.h }[6], [x21], #0x2\n"
+      "ld1 { v20.h }[6], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[14], [x28]\n"
-      "ld1 { v19.b }[14], [x27]\n"
+      "ld1 { v26.b }[14], [x27]\n"
       "ld1 { v25.b }[14], [x26]\n"
-      "ld1 { v18.b }[14], [x25]\n"
+      "ld1 { v24.b }[14], [x25]\n"
       "ld1 { v23.b }[14], [x24]\n"
-      "ld1 { v17.b }[14], [x23]\n"
+      "ld1 { v22.b }[14], [x23]\n"
       "ld1 { v21.b }[14], [x22]\n"
-      "ld1 { v16.b }[14], [x21]\n"
+      "ld1 { v20.b }[14], [x21]\n"
       "b 13f\n"
       "6:"  // odd_loads_1_12
       "mov x20, #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[12], [x28]\n"
-      "ld1 { v19.b }[12], [x27]\n"
+      "ld1 { v26.b }[12], [x27]\n"
       "ld1 { v25.b }[12], [x26]\n"
-      "ld1 { v18.b }[12], [x25]\n"
+      "ld1 { v24.b }[12], [x25]\n"
       "ld1 { v23.b }[12], [x24]\n"
-      "ld1 { v17.b }[12], [x23]\n"
+      "ld1 { v22.b }[12], [x23]\n"
       "ld1 { v21.b }[12], [x22]\n"
-      "ld1 { v16.b }[12], [x21]\n"
+      "ld1 { v20.b }[12], [x21]\n"
       "b 13f\n"
       "7:"  // odd_loads_2_8
       "tbz %x[width], #1, 8f\n"
       "ld1 { v27.h }[4], [x28], #0x2\n"
-      "ld1 { v19.h }[4], [x27], #0x2\n"
+      "ld1 { v26.h }[4], [x27], #0x2\n"
       "mov x20, #0x2\n"
       "ld1 { v25.h }[4], [x26], #0x2\n"
-      "ld1 { v18.h }[4], [x25], #0x2\n"
+      "ld1 { v24.h }[4], [x25], #0x2\n"
       "ld1 { v23.h }[4], [x24], #0x2\n"
-      "ld1 { v17.h }[4], [x23], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
       "ld1 { v21.h }[4], [x22], #0x2\n"
-      "ld1 { v16.h }[4], [x21], #0x2\n"
+      "ld1 { v20.h }[4], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[10], [x28]\n"
-      "ld1 { v19.b }[10], [x27]\n"
+      "ld1 { v26.b }[10], [x27]\n"
       "ld1 { v25.b }[10], [x26]\n"
-      "ld1 { v18.b }[10], [x25]\n"
+      "ld1 { v24.b }[10], [x25]\n"
       "ld1 { v23.b }[10], [x24]\n"
-      "ld1 { v17.b }[10], [x23]\n"
+      "ld1 { v22.b }[10], [x23]\n"
       "ld1 { v21.b }[10], [x22]\n"
-      "ld1 { v16.b }[10], [x21]\n"
+      "ld1 { v20.b }[10], [x21]\n"
       "b 13f\n"
       "8:"  // odd_loads_1_8
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[8], [x28]\n"
-      "ld1 { v19.b }[8], [x27]\n"
+      "ld1 { v26.b }[8], [x27]\n"
       "mov x20, #0x2\n"
       "ld1 { v25.b }[8], [x26]\n"
-      "ld1 { v18.b }[8], [x25]\n"
+      "ld1 { v24.b }[8], [x25]\n"
       "ld1 { v23.b }[8], [x24]\n"
-      "ld1 { v17.b }[8], [x23]\n"
+      "ld1 { v22.b }[8], [x23]\n"
       "ld1 { v21.b }[8], [x22]\n"
-      "ld1 { v16.b }[8], [x21]\n"
+      "ld1 { v20.b }[8], [x21]\n"
       "b 13f\n"
       "9:"  // odd_loads_4_0
       "tbz %x[width], #2, 11f\n"
       "ldr s27, [x28], #0x4\n"
-      "ldr s19, [x27], #0x4\n"
+      "ldr s26, [x27], #0x4\n"
       "ldr s25, [x26], #0x4\n"
-      "ldr s18, [x25], #0x4\n"
+      "ldr s24, [x25], #0x4\n"
       "ldr s23, [x24], #0x4\n"
-      "ldr s17, [x23], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
       "ldr s21, [x22], #0x4\n"
-      "ldr s16, [x21], #0x4\n"
+      "ldr s20, [x21], #0x4\n"
       "tbz %x[width], #1, 10f\n"
       "ld1 { v27.h }[2], [x28], #0x2\n"
-      "ld1 { v19.h }[2], [x27], #0x2\n"
+      "ld1 { v26.h }[2], [x27], #0x2\n"
       "mov x20, #0x1\n"
       "ld1 { v25.h }[2], [x26], #0x2\n"
-      "ld1 { v18.h }[2], [x25], #0x2\n"
+      "ld1 { v24.h }[2], [x25], #0x2\n"
       "ld1 { v23.h }[2], [x24], #0x2\n"
-      "ld1 { v17.h }[2], [x23], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
       "ld1 { v21.h }[2], [x22], #0x2\n"
-      "ld1 { v16.h }[2], [x21], #0x2\n"
+      "ld1 { v20.h }[2], [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[6], [x28]\n"
-      "ld1 { v19.b }[6], [x27]\n"
+      "ld1 { v26.b }[6], [x27]\n"
       "ld1 { v25.b }[6], [x26]\n"
-      "ld1 { v18.b }[6], [x25]\n"
+      "ld1 { v24.b }[6], [x25]\n"
       "ld1 { v23.b }[6], [x24]\n"
-      "ld1 { v17.b }[6], [x23]\n"
+      "ld1 { v22.b }[6], [x23]\n"
       "ld1 { v21.b }[6], [x22]\n"
-      "ld1 { v16.b }[6], [x21]\n"
+      "ld1 { v20.b }[6], [x21]\n"
       "b 13f\n"
       "10:"  // odd_loads_1_4
       "mov x20, #0x1\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[4], [x28]\n"
-      "ld1 { v19.b }[4], [x27]\n"
+      "ld1 { v26.b }[4], [x27]\n"
       "ld1 { v25.b }[4], [x26]\n"
-      "ld1 { v18.b }[4], [x25]\n"
+      "ld1 { v24.b }[4], [x25]\n"
       "ld1 { v23.b }[4], [x24]\n"
-      "ld1 { v17.b }[4], [x23]\n"
+      "ld1 { v22.b }[4], [x23]\n"
       "ld1 { v21.b }[4], [x22]\n"
-      "ld1 { v16.b }[4], [x21]\n"
+      "ld1 { v20.b }[4], [x21]\n"
       "b 13f\n"
       "11:"  // odd_loads_2_0
       "tbz %x[width], #1, 12f\n"
       "ldr h27, [x28], #0x2\n"
-      "ldr h19, [x27], #0x2\n"
+      "ldr h26, [x27], #0x2\n"
       "mov x20, #0x1\n"
       "ldr h25, [x26], #0x2\n"
-      "ldr h18, [x25], #0x2\n"
+      "ldr h24, [x25], #0x2\n"
       "ldr h23, [x24], #0x2\n"
-      "ldr h17, [x23], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
       "ldr h21, [x22], #0x2\n"
-      "ldr h16, [x21], #0x2\n"
+      "ldr h20, [x21], #0x2\n"
       "tbz %x[width], #0, 13f\n"
       "ld1 { v27.b }[2], [x28]\n"
-      "ld1 { v19.b }[2], [x27]\n"
+      "ld1 { v26.b }[2], [x27]\n"
       "ld1 { v25.b }[2], [x26]\n"
-      "ld1 { v18.b }[2], [x25]\n"
+      "ld1 { v24.b }[2], [x25]\n"
       "ld1 { v23.b }[2], [x24]\n"
-      "ld1 { v17.b }[2], [x23]\n"
+      "ld1 { v22.b }[2], [x23]\n"
       "ld1 { v21.b }[2], [x22]\n"
-      "ld1 { v16.b }[2], [x21]\n"
+      "ld1 { v20.b }[2], [x21]\n"
       "b 13f\n"
       "12:"  // odd_loads_1_0
       "ldr b27, [x28, #0x0]\n"
-      "ldr b19, [x27, #0x0]\n"
+      "ldr b26, [x27, #0x0]\n"
       "mov x20, #0x1\n"
       "ldr b25, [x26, #0x0]\n"
-      "ldr b18, [x25, #0x0]\n"
+      "ldr b24, [x25, #0x0]\n"
       "ldr b23, [x24, #0x0]\n"
-      "ldr b17, [x23, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
       "ldr b21, [x22, #0x0]\n"
-      "ldr b16, [x21, #0x0]\n"
+      "ldr b20, [x21, #0x0]\n"
       "13:"  // Odd load end
-      "zip1 v26.2d, v27.2d, v19.2d\n"
-      "zip1 v24.2d, v25.2d, v18.2d\n"
+      "zip1 v19.2d, v27.2d, v26.2d\n"
+      "zip1 v18.2d, v25.2d, v24.2d\n"
       "subs x20, x20, #0x1\n"
-      "str q26, [%x[out_ptr], #0x0]\n"
-      "zip1 v22.2d, v23.2d, v17.2d\n"
-      "zip1 v20.2d, v21.2d, v16.2d\n"
-      "str q24, [%x[out_ptr], #0x10]\n"
-      "uadalp v5.8h, v26.16b\n"
-      "uadalp v4.8h, v24.16b\n"
-      "str q22, [%x[out_ptr], #0x20]\n"
-      "uadalp v3.8h, v22.16b\n"
-      "str q20, [%x[out_ptr], #0x30]\n"
-      "uadalp v2.8h, v20.16b\n"
+      "str q19, [%x[out_ptr], #0x0]\n"
+      "zip1 v17.2d, v23.2d, v22.2d\n"
+      "zip1 v16.2d, v21.2d, v20.2d\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "uadalp v5.8h, v19.16b\n"
+      "uadalp v4.8h, v18.16b\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "uadalp v3.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "uadalp v2.8h, v16.16b\n"
       "add %x[out_ptr], %x[out_ptr], #0x40\n"
       "beq 14f\n"
-      "zip2 v19.2d, v27.2d, v19.2d\n"
-      "zip2 v18.2d, v25.2d, v18.2d\n"
+      "zip2 v19.2d, v27.2d, v26.2d\n"
+      "zip2 v18.2d, v25.2d, v24.2d\n"
       "str q19, [%x[out_ptr], #0x0]\n"
-      "zip2 v17.2d, v23.2d, v17.2d\n"
-      "zip2 v16.2d, v21.2d, v16.2d\n"
+      "zip2 v17.2d, v23.2d, v22.2d\n"
+      "zip2 v16.2d, v21.2d, v20.2d\n"
       "str q18, [%x[out_ptr], #0x10]\n"
       "uadalp v5.8h, v19.16b\n"
       "uadalp v4.8h, v18.16b\n"
@@ -346,11 +346,11 @@
       "uadalp v31.4s, v3.8h\n"
       "uadalp v30.4s, v2.8h\n"
       "addp v1.4s, v1.4s, v0.4s\n"
-      "addp v0.4s, v31.4s, v30.4s\n"
+      "addp v16.4s, v31.4s, v30.4s\n"
       "add v1.4s, v1.4s, v29.4s\n"
-      "add v0.4s, v0.4s, v28.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
       "str q1, [%x[out_ptr], #0x0]\n"
-      "str q0, [%x[out_ptr], #0x10]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
       "add %x[out_ptr], %x[out_ptr], #0x20\n"
       : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
       : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
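Note on the interleave/row-sum tail above: the zip1/zip2 pairs interleave the 64-bit halves of two input rows into each 16-byte output chunk, while the uadalp chain accumulates the unsigned bytes of every stored chunk into per-row sums (first into 16-bit lanes, then into 32-bit lanes, reduced with addp at the end). A rough scalar model, assuming eight u8 input rows; the names are illustrative only, not taken from the library:

    #include <cstddef>
    #include <cstdint>

    // Rough scalar model of the interleave-with-row-sums layout above.
    // Eight u8 rows are paired, 8-byte blocks from each row of a pair are
    // placed back to back in the output (zip1/zip2 on .2d halves), and every
    // byte written is also added into that row's running sum (the uadalp
    // accumulation).
    void interleave8_u8_with_sums_model(uint8_t *&out, const uint8_t *const in[8],
                                        size_t blocks, int32_t row_sum[8]) {
        for (size_t b = 0; b < blocks; b++) {
            for (int pair = 0; pair < 4; pair++) {        // rows (0,1), (2,3), (4,5), (6,7)
                for (int r = 0; r < 2; r++) {
                    const uint8_t *src = in[2 * pair + r] + 8 * b;
                    for (int i = 0; i < 8; i++) {
                        *out++ = src[i];                  // str q..., [%x[out_ptr], ...]
                        row_sum[2 * pair + r] += src[i];  // uadalp accumulation
                    }
                }
            }
        }
    }

The two q stores emitted after the final addp reduction broadly correspond to flushing row_sum[0..7] as eight 32-bit values.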
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
index 51b91d1..a5f4754 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 template <>
 void interleave_block<1, 2, VLType::SME, false>(
   bfloat16 * &out, const float * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x22, ALL, MUL #2\n"
@@ -153,4 +151,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
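Note on the signature change above: instead of naming the trailing bool parameter and marking it with ARM_COMPUTE_UNUSED, the specialisation now leaves the parameter unnamed, which silences unused-parameter warnings without any macro. A minimal standalone illustration (names are illustrative only):

    // Two equivalent ways of accepting a parameter the body never reads.
    int with_macro(int value, bool first) {
        (void)first;   // conceptually what an "unused parameter" macro expands to
        return value + 1;
    }

    int with_unnamed_parameter(int value, bool /* first */) {
        // The parameter keeps its slot in the signature, but having no name
        // means the compiler cannot flag it as unused.
        return value + 1;
    }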
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
index 25bfad1..c1d0ac5 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 template <>
 void interleave_block<2, 2, VLType::SME, false>(
   bfloat16 * &out, const float * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x22, ALL, MUL #2\n"
@@ -184,4 +182,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
index 9255831..03575d7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 template <>
 void interleave_block<4, 2, VLType::SME, false>(
   bfloat16 * &out, const float * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x23, ALL, MUL #2\n"
@@ -159,4 +157,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
index 9b66a6f..453778a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 1, VLType::SME, false>(
   bfloat16 * &out, const bfloat16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "mov x21, %x[width]\n"
@@ -168,9 +166,9 @@
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25286d20  // psel p0.h, p11.h/Z, p9.h[w12]\n"
       ".inst 0xe07f82a0  // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
-      "ldr x25, [x26, #0x0]\n"
+      "ldr x20, [x26, #0x0]\n"
       ".inst 0x25286140  // psel p0.h, p8.h/Z, p10.h[w12]\n"
-      ".inst 0xe0560328  // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+      ".inst 0xe0560288  // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x11\n"
       "add x26, x26, #0x8\n"
@@ -186,7 +184,7 @@
       "cmp x12, x10\n"
       "addvl x21, x21, #1\n"
       "blt 10b\n"
-      "whilelt p9.h, x27, %x[width]\n"
+      "whilelt p8.h, x27, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -206,4 +204,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
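Note on the register renames above: these kernels hand-assemble the SME loads and stores as raw .inst words, so whenever the base register in the disassembly comment changes (for example x25 becoming x20), the encoded word has to change in lockstep. In the ld1h pairs shown in this patch only bits [9:5] of the word differ, which is the field holding the Xn base register. A small self-contained check of that relationship (illustrative helper, not part of the library):

    #include <cassert>
    #include <cstdint>

    // Replace the Xn base-register field (bits [9:5]) of a hand-assembled
    // load/store word such as the .inst values in the hunks above.
    constexpr uint32_t set_base_register(uint32_t inst, unsigned xn) {
        return (inst & ~(0x1Fu << 5)) | ((xn & 0x1Fu) << 5);
    }

    int main() {
        // "ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]"  ->  0xe0560328
        // "ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]"  ->  0xe0560288
        assert(set_base_register(0xe0560328u, 20) == 0xe0560288u);
        assert(set_base_register(0xe0560288u, 25) == 0xe0560328u);
        return 0;
    }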
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
index d0375de..98bdcd2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 2, VLType::SME, false>(
   bfloat16 * &out, const bfloat16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cnth x22\n"
@@ -176,11 +174,11 @@
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
-      "ldr x25, [x26, #0x0]\n"
+      "ldr x20, [x26, #0x0]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25396140  // psel p0.h, p8.h/Z, p10.h[w13, #1]\n"
       "cmp x12, x10\n"
-      ".inst 0xe0562321  // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n"
+      ".inst 0xe0562281  // ld1h { za0h.h[x13, #1] }, p0/Z, [x20, x22, LSL #1]\n"
       "add x26, x26, #0x8\n"
       "addvl x21, x21, #1\n"
       "add x13, x13, #0x2\n"
@@ -197,7 +195,7 @@
       "addvl x21, x21, #1\n"
       "add x20, x20, #0x2\n"
       "blt 10b\n"
-      "whilelt p9.h, x27, %x[width]\n"
+      "whilelt p8.h, x27, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -217,4 +215,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
index 622d9aa..4390bb7 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 4, VLType::SME, false>(
   int8_t * &out, const int8_t * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntb x21\n"
@@ -179,11 +177,11 @@
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
-      "ldr x25, [x26, #0x0]\n"
+      "ldr x20, [x26, #0x0]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
       "cmp x12, x9\n"
-      ".inst 0xe0162322  // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+      ".inst 0xe0162282  // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
       "add x26, x26, #0x8\n"
       "addvl x21, x21, #1\n"
       "add x13, x13, #0x4\n"
@@ -200,7 +198,7 @@
       "addvl x21, x21, #1\n"
       "add x20, x20, #0x4\n"
       "blt 10b\n"
-      "whilelt p9.b, x27, %x[width]\n"
+      "whilelt p8.b, x27, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -220,4 +218,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
index 07f0370..f5ee261 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 4, VLType::SME, true>(
@@ -200,12 +200,12 @@
       "10:"  // K loop: Tails: Even: First
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8300  // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
-      "ldr x22, [x23, #0x0]\n"
+      "ldr x20, [x23, #0x0]\n"
       ".inst 0xc0828810  // mova z16.s, p2/M, za0v.s[x12]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
       "sdot z17.s, z16.b, z18.b\n"
-      ".inst 0xe01922c2  // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+      ".inst 0xe0192282  // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
       "cmp x12, x9\n"
       "add x23, x23, #0x8\n"
       "addvl x24, x24, #1\n"
@@ -225,7 +225,7 @@
       "addvl x24, x24, #1\n"
       "add x20, x20, #0x4\n"
       "blt 11b\n"
-      "whilelt p9.b, x28, %x[width]\n"
+      "whilelt p8.b, x28, %x[width]\n"
       "b 14f\n"
       "12:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -249,4 +249,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
index 618570d..76c1d05 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 4, VLType::SME, false>(
   uint8_t * &out, const uint8_t * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntb x21\n"
@@ -179,11 +177,11 @@
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
-      "ldr x25, [x26, #0x0]\n"
+      "ldr x20, [x26, #0x0]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
       "cmp x12, x9\n"
-      ".inst 0xe0162322  // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n"
+      ".inst 0xe0162282  // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
       "add x26, x26, #0x8\n"
       "addvl x21, x21, #1\n"
       "add x13, x13, #0x4\n"
@@ -200,7 +198,7 @@
       "addvl x21, x21, #1\n"
       "add x20, x20, #0x4\n"
       "blt 10b\n"
-      "whilelt p9.b, x27, %x[width]\n"
+      "whilelt p8.b, x27, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -220,4 +218,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
index 646db0c..daf2d3a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 4, VLType::SME, true>(
@@ -200,12 +200,12 @@
       "10:"  // K loop: Tails: Even: First
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8300  // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n"
-      "ldr x22, [x23, #0x0]\n"
+      "ldr x20, [x23, #0x0]\n"
       ".inst 0xc0828810  // mova z16.s, p2/M, za0v.s[x12]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
       "udot z17.s, z16.b, z18.b\n"
-      ".inst 0xe01922c2  // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n"
+      ".inst 0xe0192282  // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n"
       "cmp x12, x9\n"
       "add x23, x23, #0x8\n"
       "addvl x24, x24, #1\n"
@@ -225,7 +225,7 @@
       "addvl x24, x24, #1\n"
       "add x20, x20, #0x4\n"
       "blt 11b\n"
-      "whilelt p9.b, x28, %x[width]\n"
+      "whilelt p8.b, x28, %x[width]\n"
       "b 14f\n"
       "12:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -249,4 +249,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
index 788c1a2..274f69f 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 1, VLType::SME, false>(
   __fp16 * &out, const __fp16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "mov x21, %x[width]\n"
@@ -168,9 +166,9 @@
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25286d20  // psel p0.h, p11.h/Z, p9.h[w12]\n"
       ".inst 0xe07f82a0  // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n"
-      "ldr x25, [x26, #0x0]\n"
+      "ldr x20, [x26, #0x0]\n"
       ".inst 0x25286140  // psel p0.h, p8.h/Z, p10.h[w12]\n"
-      ".inst 0xe0560328  // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n"
+      ".inst 0xe0560288  // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x11\n"
       "add x26, x26, #0x8\n"
@@ -186,7 +184,7 @@
       "cmp x12, x10\n"
       "addvl x21, x21, #1\n"
       "blt 10b\n"
-      "whilelt p9.h, x27, %x[width]\n"
+      "whilelt p8.h, x27, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -206,4 +204,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
index 7de8854..ab29064 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<1, 1, VLType::SME, false>(
   float * &out, const float * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "mov x22, %x[width]\n"
@@ -167,9 +165,9 @@
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
-      "ldr x25, [x26, #0x0]\n"
+      "ldr x20, [x26, #0x0]\n"
       ".inst 0x25306140  // psel p0.s, p8.s/Z, p10.s[w12]\n"
-      ".inst 0xe0960328  // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0960288  // ld1w { za2h.s[x12] }, p0/Z, [x20, x22, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x10\n"
       "add x26, x26, #0x8\n"
@@ -185,7 +183,7 @@
       "cmp x12, x9\n"
       "addvl x21, x21, #1\n"
       "blt 10b\n"
-      "whilelt p9.s, x27, %x[width]\n"
+      "whilelt p8.s, x27, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -205,4 +203,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
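Note on the whilelt predicates used in the tails above: whilelt builds a governing predicate whose lane i is active while the element index is still below width, so the final partial vector is loaded and stored under that mask rather than through a separate scalar tail loop. A scalar model of the predicate (illustrative only):

    #include <cstddef>
    #include <vector>

    // Model of "whilelt pX.s, x27, %x[width]": lane i of the predicate is
    // active iff (base + i) < width, masking off elements past the end of
    // the row.
    std::vector<bool> whilelt_model(size_t base, size_t width, size_t vl_elems) {
        std::vector<bool> pred(vl_elems);
        for (size_t i = 0; i < vl_elems; i++) {
            pred[i] = (base + i) < width;
        }
        return pred;
    }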
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
index 14ee5d6..dc6d12b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 1, VLType::SME, false>(
   bfloat16 * &out, const bfloat16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cnth x28\n"
@@ -97,4 +95,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
index f648ccf..d918925 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp
@@ -22,32 +22,30 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 2, VLType::SME, false>(
   bfloat16 * &out, const bfloat16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
-      "cnth x21\n"
-      "mov x22, %x[width]\n"
-      "inch x22\n"
+      "cnth x22\n"
+      "mov x21, %x[width]\n"
+      "inch x21\n"
       "mov x20, %x[width]\n"
-      "sub x17, x21, #0x1\n"
-      "sub x22, x22, #0x1\n"
+      "sub x17, x22, #0x1\n"
+      "sub x21, x21, #0x1\n"
       "ands x17, x20, x17\n"
       "cntw x16\n"
-      "udiv x22, x22, x21\n"  // n_passes = ceildiv(width, VL<T>)
-      "csel x17, x17, x21, NE\n"
-      "sub x13, x22, #0x1\n"
+      "udiv x21, x21, x22\n"  // n_passes = ceildiv(width, VL<T>)
+      "csel x17, x17, x22, NE\n"
+      "sub x13, x21, #0x1\n"
       "add x17, x17, #0x1\n"
       "sub x15, x16, #0x2\n"
-      "lsl x21, %x[height], #0x1\n"  // height * 2
+      "lsl x22, %x[height], #0x1\n"  // height * 2
       "lsl x20, x16, #0x1\n"
       "mov x14, #0x0\n"
       "mov x11, %x[in]\n"
@@ -57,15 +55,15 @@
       "cntw x27, ALL, MUL #3\n"
       "ldr x26, [x10, #0x0]\n"
       "lsr x13, x13, #0x1\n"  // n_loops = (n_passes - 1) / 2
-      "and x25, x22, #0x1\n"  // odd_tail = bool(n_passes & 0x1)
+      "and x25, x21, #0x1\n"  // odd_tail = bool(n_passes & 0x1)
       "ldr x24, [x11, #0x8]\n"
       "lsr x17, x17, #0x1\n"
       "ptrue p13.s\n"
-      "ldr x23, [x10, #0x8]\n"
-      "whilelt p12.h, XZR, x21\n"
-      "whilelt p11.h, x20, x21\n"
-      "mov x22, %x[row_offset]\n"
-      "mov x21, %x[out]\n"
+      "ldr x21, [x10, #0x8]\n"
+      "whilelt p12.h, XZR, x22\n"
+      "whilelt p11.h, x20, x22\n"
+      "mov x23, %x[row_offset]\n"
+      "mov x22, %x[out]\n"
       "whilelt p10.h, x14, %x[width]\n"
       "whilelt p9.h, x14, %x[width]\n"
       "whilelt p8.h, x14, %x[width]\n"
@@ -76,39 +74,39 @@
       "1:"  // K loop: Charge: Loop
       ".inst 0x25286581  // psel p1.h, p9.h/Z, p12.h[w12]\n"
       ".inst 0x25286160  // psel p0.h, p8.h/Z, p11.h[w12]\n"
-      ".inst 0xe0560520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
+      ".inst 0xe0570520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe0560348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0570348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25686581  // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
       ".inst 0x25686160  // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
       "ldr x26, [x10, #0x0]\n"
-      ".inst 0xe0560702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0570702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe05602ea  // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05702aa  // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x15, LSL #1\n"
-      "ldr x23, [x10, #0x8]\n"
+      "ldr x21, [x10, #0x8]\n"
       "add x10, x10, #0x10\n"
       "blt 1b\n"
       "2:"  // K loop: Charge: End
       ".inst 0x25286581  // psel p1.h, p9.h/Z, p12.h[w12]\n"
       ".inst 0x25286160  // psel p0.h, p8.h/Z, p11.h[w12]\n"
-      ".inst 0xe0560520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0560348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0570520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+      ".inst 0xe0570348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25686581  // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
       ".inst 0x25686160  // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
       "mov x11, %x[in]\n"
       "add x10, %x[in], x16, LSL #3\n"
-      ".inst 0xe0560702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0570702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe05602ea  // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05702aa  // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
       "ldr x26, [x10, #0x0]\n"
-      "inch x22\n"
+      "inch x23\n"
       "inch x14\n"
       "ldr x24, [x11, #0x8]\n"
       "add x11, x11, #0x10\n"
-      "ldr x23, [x10, #0x8]\n"
+      "ldr x21, [x10, #0x8]\n"
       "add x10, x10, #0x10\n"
       "cbz x13, 8f\n"
       "mov x20, x13\n"
@@ -121,60 +119,60 @@
       "4:"  // K loop: Main loop: First: Loop
       ".inst 0x25396581  // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
       ".inst 0x25396160  // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
-      ".inst 0xe0562521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
+      ".inst 0xe0572521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe0562349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25796580  // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
       ".inst 0x25796162  // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307541  // psel p1.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562aeb  // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf86a0  // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0572aab  // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf86c0  // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe0bc86a1  // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c1  // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x10, x10, #0x10\n"
       "add x13, x13, #0x4\n"
-      ".inst 0xe0bb82a5  // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+      ".inst 0xe0bb82c5  // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x15\n"
-      "addvl x21, x21, #4\n"
+      "addvl x22, x22, #4\n"
       "blt 4b\n"
       "5:"  // K loop: Main loop: First: Tail
       ".inst 0x25396581  // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
       ".inst 0x25396160  // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
-      ".inst 0xe0562521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0562349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+      ".inst 0xe0572349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
       "mov x11, %x[in]\n"
       "add x10, %x[in], x16, LSL #3\n"
       "ldr x9, [x11, #0x0]\n"
       ".inst 0x25796580  // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
       ".inst 0x25796161  // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
-      ".inst 0xe0562303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe05626eb  // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05726ab  // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307542  // psel p2.s, p13.s/Z, p10.s[w12]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b08aa4  // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b08ac4  // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
       "whilelt p10.h, x14, %x[width]\n"
       "inch x14\n"
-      ".inst 0xe0bc86a1  // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c1  // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      ".inst 0xe0bb82a5  // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
-      "addvl x21, x21, #4\n"
-      "inch x22\n"
+      ".inst 0xe0bb82c5  // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+      "addvl x22, x22, #4\n"
+      "inch x23\n"
       "whilelt p9.h, x14, %x[width]\n"
       "whilelt p8.h, x14, %x[width]\n"
       "mov x13, #0x0\n"
@@ -183,61 +181,61 @@
       "6:"  // K loop: Main loop: Second: Loop
       ".inst 0x25296581  // psel p1.h, p9.h/Z, p12.h[w13]\n"
       ".inst 0x25296160  // psel p0.h, p8.h/Z, p11.h[w13]\n"
-      ".inst 0xe0562520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
+      ".inst 0xe0572520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe0562348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25696580  // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
       ".inst 0x25696162  // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307541  // psel p1.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562aea  // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf86a8  // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0572aaa  // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf86c8  // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b082ac  // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082cc  // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe0bc86a9  // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c9  // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x10, x10, #0x10\n"
       "add x13, x13, #0x4\n"
-      ".inst 0xe0bb82ad  // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+      ".inst 0xe0bb82cd  // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x15\n"
-      "addvl x21, x21, #4\n"
+      "addvl x22, x22, #4\n"
       "blt 6b\n"
       "7:"  // K loop: Main loop: Second: Tail
       ".inst 0x25296581  // psel p1.h, p9.h/Z, p12.h[w13]\n"
       ".inst 0x25296160  // psel p0.h, p8.h/Z, p11.h[w13]\n"
-      ".inst 0xe0562520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0562348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+      ".inst 0xe0572348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
       "mov x11, %x[in]\n"
       "add x10, %x[in], x16, LSL #3\n"
       "ldr x9, [x11, #0x0]\n"
       ".inst 0x25696580  // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
       ".inst 0x25696161  // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
-      ".inst 0xe0562302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe05626ea  // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05726aa  // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307542  // psel p2.s, p13.s/Z, p10.s[w12]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf82a8  // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf82c8  // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b08aac  // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b08acc  // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
       "whilelt p10.h, x14, %x[width]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xe0bc86a9  // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c9  // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      ".inst 0xe0bb82ad  // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
-      "addvl x21, x21, #4\n"
+      ".inst 0xe0bb82cd  // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+      "addvl x22, x22, #4\n"
       "inch x14\n"
-      "inch x22\n"
+      "inch x23\n"
       "bgt 3b\n"
       "8:"  // K loop: Tails
       "cbnz x25, 11f\n"
@@ -248,51 +246,51 @@
       "mov x12, #0x0\n"
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
-      "ldr x9, [x11, #0x0]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+      "ldr x21, [x11, #0x0]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25396581  // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
-      "ldr x26, [x11, x16, LSL #0x3]\n"
+      "ldr x20, [x11, x16, LSL #0x3]\n"
       ".inst 0x25396160  // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
       "cmp x12, x16\n"
-      ".inst 0xe0562521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0562349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe05726a1  // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
+      ".inst 0xe0572289  // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
       "add x11, x11, #0x8\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "add x13, x13, #0x2\n"
       "blt 9b\n"
       "whilelt p10.h, x14, %x[width]\n"
-      "whilelt p9.h, x14, %x[width]\n"
+      "whilelt p8.h, x14, %x[width]\n"
       "whilelt p8.h, x14, %x[width]\n"
       "mov x20, #0x0\n"
       "mov x12, #0x0\n"
       "10:"  // K loop: Tails: Even: Second
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a8  // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c8  // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082ac  // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082cc  // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "add x20, x20, #0x2\n"
       "blt 10b\n"
-      "whilelt p10.h, x14, %x[width]\n"
+      "whilelt p8.h, x14, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
       "12:"  // K loop: Tails: Odd: Loop
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "blt 12b\n"
       "13:"  // K loop: End
-      "mov %x[out], x21\n"
+      "mov %x[out], x22\n"
       ".inst 0xd503467f  // SMSTOP\n"
       : [out] "+&r" (out)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
@@ -300,4 +298,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
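Note on the loop-count arithmetic at the top of the K loop above: the comments embedded in the assembly spell out the quantities being computed (n_passes = ceildiv(width, VL<T>), n_loops = (n_passes - 1) / 2, odd_tail = bool(n_passes & 0x1)). A scalar model of that bookkeeping, assuming width > 0 and with illustrative names:

    #include <cstddef>

    struct KLoopCounts {
        size_t n_passes;  // vector-wide column passes over the input width
        size_t n_loops;   // iterations of the two-pass unrolled main loop
        bool   odd_tail;  // true if one extra pass remains after the main loop
    };

    // vl_elems models cnth, i.e. the number of 16-bit elements per vector.
    KLoopCounts k_loop_counts(size_t width, size_t vl_elems) {
        KLoopCounts c;
        c.n_passes = (width + vl_elems - 1) / vl_elems;  // ceildiv(width, VL<T>)
        c.n_loops  = (c.n_passes - 1) / 2;               // each main-loop iteration covers two passes
        c.odd_tail = (c.n_passes & 1) != 0;              // taken care of by the "Tails: Odd" path
        return c;
    }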
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
index 61536d3..ef787c8 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp
@@ -22,32 +22,30 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 2, VLType::SME, false>(
   __fp16 * &out, const __fp16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
-      "cnth x21\n"
-      "mov x22, %x[width]\n"
-      "inch x22\n"
+      "cnth x22\n"
+      "mov x21, %x[width]\n"
+      "inch x21\n"
       "mov x20, %x[width]\n"
-      "sub x17, x21, #0x1\n"
-      "sub x22, x22, #0x1\n"
+      "sub x17, x22, #0x1\n"
+      "sub x21, x21, #0x1\n"
       "ands x17, x20, x17\n"
       "cntw x16\n"
-      "udiv x22, x22, x21\n"  // n_passes = ceildiv(width, VL<T>)
-      "csel x17, x17, x21, NE\n"
-      "sub x13, x22, #0x1\n"
+      "udiv x21, x21, x22\n"  // n_passes = ceildiv(width, VL<T>)
+      "csel x17, x17, x22, NE\n"
+      "sub x13, x21, #0x1\n"
       "add x17, x17, #0x1\n"
       "sub x15, x16, #0x2\n"
-      "lsl x21, %x[height], #0x1\n"  // height * 2
+      "lsl x22, %x[height], #0x1\n"  // height * 2
       "lsl x20, x16, #0x1\n"
       "mov x14, #0x0\n"
       "mov x11, %x[in]\n"
@@ -57,15 +55,15 @@
       "cntw x27, ALL, MUL #3\n"
       "ldr x26, [x10, #0x0]\n"
       "lsr x13, x13, #0x1\n"  // n_loops = (n_passes - 1) / 2
-      "and x25, x22, #0x1\n"  // odd_tail = bool(n_passes & 0x1)
+      "and x25, x21, #0x1\n"  // odd_tail = bool(n_passes & 0x1)
       "ldr x24, [x11, #0x8]\n"
       "lsr x17, x17, #0x1\n"
       "ptrue p13.s\n"
-      "ldr x23, [x10, #0x8]\n"
-      "whilelt p12.h, XZR, x21\n"
-      "whilelt p11.h, x20, x21\n"
-      "mov x22, %x[row_offset]\n"
-      "mov x21, %x[out]\n"
+      "ldr x21, [x10, #0x8]\n"
+      "whilelt p12.h, XZR, x22\n"
+      "whilelt p11.h, x20, x22\n"
+      "mov x23, %x[row_offset]\n"
+      "mov x22, %x[out]\n"
       "whilelt p10.h, x14, %x[width]\n"
       "whilelt p9.h, x14, %x[width]\n"
       "whilelt p8.h, x14, %x[width]\n"
@@ -76,39 +74,39 @@
       "1:"  // K loop: Charge: Loop
       ".inst 0x25286581  // psel p1.h, p9.h/Z, p12.h[w12]\n"
       ".inst 0x25286160  // psel p0.h, p8.h/Z, p11.h[w12]\n"
-      ".inst 0xe0560520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
+      ".inst 0xe0570520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe0560348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0570348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25686581  // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
       ".inst 0x25686160  // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
       "ldr x26, [x10, #0x0]\n"
-      ".inst 0xe0560702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0570702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe05602ea  // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05702aa  // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x15, LSL #1\n"
-      "ldr x23, [x10, #0x8]\n"
+      "ldr x21, [x10, #0x8]\n"
       "add x10, x10, #0x10\n"
       "blt 1b\n"
       "2:"  // K loop: Charge: End
       ".inst 0x25286581  // psel p1.h, p9.h/Z, p12.h[w12]\n"
       ".inst 0x25286160  // psel p0.h, p8.h/Z, p11.h[w12]\n"
-      ".inst 0xe0560520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0560348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0570520  // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n"
+      ".inst 0xe0570348  // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25686581  // psel p1.h, p9.h/Z, p12.h[w12, #2]\n"
       ".inst 0x25686160  // psel p0.h, p8.h/Z, p11.h[w12, #2]\n"
       "mov x11, %x[in]\n"
       "add x10, %x[in], x16, LSL #3\n"
-      ".inst 0xe0560702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0570702  // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe05602ea  // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05702aa  // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n"
       "ldr x26, [x10, #0x0]\n"
-      "inch x22\n"
+      "inch x23\n"
       "inch x14\n"
       "ldr x24, [x11, #0x8]\n"
       "add x11, x11, #0x10\n"
-      "ldr x23, [x10, #0x8]\n"
+      "ldr x21, [x10, #0x8]\n"
       "add x10, x10, #0x10\n"
       "cbz x13, 8f\n"
       "mov x20, x13\n"
@@ -121,60 +119,60 @@
       "4:"  // K loop: Main loop: First: Loop
       ".inst 0x25396581  // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
       ".inst 0x25396160  // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
-      ".inst 0xe0562521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
+      ".inst 0xe0572521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe0562349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25796580  // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
       ".inst 0x25796162  // psel p2.h, p8.h/Z, p11.h[w13, #3]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307541  // psel p1.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562aeb  // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf86a0  // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0572aab  // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf86c0  // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe0bc86a1  // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c1  // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x10, x10, #0x10\n"
       "add x13, x13, #0x4\n"
-      ".inst 0xe0bb82a5  // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+      ".inst 0xe0bb82c5  // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x15\n"
-      "addvl x21, x21, #4\n"
+      "addvl x22, x22, #4\n"
       "blt 4b\n"
       "5:"  // K loop: Main loop: First: Tail
       ".inst 0x25396581  // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
       ".inst 0x25396160  // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
-      ".inst 0xe0562521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0562349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n"
+      ".inst 0xe0572349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n"
       "mov x11, %x[in]\n"
       "add x10, %x[in], x16, LSL #3\n"
       "ldr x9, [x11, #0x0]\n"
       ".inst 0x25796580  // psel p0.h, p9.h/Z, p12.h[w13, #3]\n"
       ".inst 0x25796161  // psel p1.h, p8.h/Z, p11.h[w13, #3]\n"
-      ".inst 0xe0562303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572303  // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe05626eb  // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05726ab  // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307542  // psel p2.s, p13.s/Z, p10.s[w12]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b08aa4  // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b08ac4  // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
       "whilelt p10.h, x14, %x[width]\n"
       "inch x14\n"
-      ".inst 0xe0bc86a1  // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c1  // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      ".inst 0xe0bb82a5  // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
-      "addvl x21, x21, #4\n"
-      "inch x22\n"
+      ".inst 0xe0bb82c5  // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+      "addvl x22, x22, #4\n"
+      "inch x23\n"
       "whilelt p9.h, x14, %x[width]\n"
       "whilelt p8.h, x14, %x[width]\n"
       "mov x13, #0x0\n"
@@ -183,61 +181,61 @@
       "6:"  // K loop: Main loop: Second: Loop
       ".inst 0x25296581  // psel p1.h, p9.h/Z, p12.h[w13]\n"
       ".inst 0x25296160  // psel p0.h, p8.h/Z, p11.h[w13]\n"
-      ".inst 0xe0562520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
+      ".inst 0xe0572520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
       "ldr x9, [x11, #0x0]\n"
-      ".inst 0xe0562348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
       ".inst 0x25696580  // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
       ".inst 0x25696162  // psel p2.h, p8.h/Z, p11.h[w13, #2]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307541  // psel p1.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0562aea  // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf86a8  // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0572aaa  // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf86c8  // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b082ac  // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082cc  // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe0bc86a9  // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c9  // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x10, x10, #0x10\n"
       "add x13, x13, #0x4\n"
-      ".inst 0xe0bb82ad  // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
+      ".inst 0xe0bb82cd  // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x15\n"
-      "addvl x21, x21, #4\n"
+      "addvl x22, x22, #4\n"
       "blt 6b\n"
       "7:"  // K loop: Main loop: Second: Tail
       ".inst 0x25296581  // psel p1.h, p9.h/Z, p12.h[w13]\n"
       ".inst 0x25296160  // psel p0.h, p8.h/Z, p11.h[w13]\n"
-      ".inst 0xe0562520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0562348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe0572520  // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n"
+      ".inst 0xe0572348  // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n"
       "mov x11, %x[in]\n"
       "add x10, %x[in], x16, LSL #3\n"
       "ldr x9, [x11, #0x0]\n"
       ".inst 0x25696580  // psel p0.h, p9.h/Z, p12.h[w13, #2]\n"
       ".inst 0x25696161  // psel p1.h, p8.h/Z, p11.h[w13, #2]\n"
-      ".inst 0xe0562302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n"
+      ".inst 0xe0572302  // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n"
       "ldr x26, [x10, #0x0]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe05626ea  // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n"
+      ".inst 0xe05726aa  // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n"
       "ldr x24, [x11, #0x8]\n"
       ".inst 0x25307542  // psel p2.s, p13.s/Z, p10.s[w12]\n"
-      "ldr x23, [x10, #0x8]\n"
-      ".inst 0xe0bf82a8  // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      "ldr x21, [x10, #0x8]\n"
+      ".inst 0xe0bf82c8  // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b08aac  // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b08acc  // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
       "whilelt p10.h, x14, %x[width]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xe0bc86a9  // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c9  // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      ".inst 0xe0bb82ad  // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n"
-      "addvl x21, x21, #4\n"
+      ".inst 0xe0bb82cd  // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n"
+      "addvl x22, x22, #4\n"
       "inch x14\n"
-      "inch x22\n"
+      "inch x23\n"
       "bgt 3b\n"
       "8:"  // K loop: Tails
       "cbnz x25, 11f\n"
@@ -248,51 +246,51 @@
       "mov x12, #0x0\n"
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
-      "ldr x9, [x11, #0x0]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+      "ldr x21, [x11, #0x0]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x25396581  // psel p1.h, p9.h/Z, p12.h[w13, #1]\n"
-      "ldr x26, [x11, x16, LSL #0x3]\n"
+      "ldr x20, [x11, x16, LSL #0x3]\n"
       ".inst 0x25396160  // psel p0.h, p8.h/Z, p11.h[w13, #1]\n"
       "cmp x12, x16\n"
-      ".inst 0xe0562521  // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n"
-      ".inst 0xe0562349  // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n"
+      ".inst 0xe05726a1  // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n"
+      ".inst 0xe0572289  // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n"
       "add x11, x11, #0x8\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "add x13, x13, #0x2\n"
       "blt 9b\n"
       "whilelt p10.h, x14, %x[width]\n"
-      "whilelt p9.h, x14, %x[width]\n"
+      "whilelt p8.h, x14, %x[width]\n"
       "whilelt p8.h, x14, %x[width]\n"
       "mov x20, #0x0\n"
       "mov x12, #0x0\n"
       "10:"  // K loop: Tails: Even: Second
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a8  // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c8  // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082ac  // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082cc  // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "add x20, x20, #0x2\n"
       "blt 10b\n"
-      "whilelt p10.h, x14, %x[width]\n"
+      "whilelt p8.h, x14, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
       "12:"  // K loop: Tails: Odd: Loop
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "blt 12b\n"
       "13:"  // K loop: End
-      "mov %x[out], x21\n"
+      "mov %x[out], x22\n"
       ".inst 0xd503467f  // SMSTOP\n"
       : [out] "+&r" (out)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
@@ -300,4 +298,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
index 4c701cf..905c6b4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 4, VLType::SME, false>(
   int8_t * &out, const int8_t * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntb x21\n"
@@ -248,13 +246,13 @@
       ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
-      "ldr x9, [x11, #0x0]\n"
+      "ldr x20, [x11, #0x0]\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
-      ".inst 0xe0162122  // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
-      "ldr x26, [x11, x16, LSL #0x3]\n"
+      ".inst 0xe0162282  // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+      "ldr x20, [x11, x16, LSL #0x3]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x253d6140  // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
-      ".inst 0xe0162343  // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+      ".inst 0xe0162283  // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
       "cmp x12, x16\n"
       "add x11, x11, #0x8\n"
       "addvl x21, x21, #2\n"
@@ -274,7 +272,7 @@
       "addvl x21, x21, #2\n"
       "add x20, x20, #0x4\n"
       "blt 10b\n"
-      "whilelt p9.b, x14, %x[width]\n"
+      "whilelt p8.b, x14, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -296,4 +294,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
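
Aside on the interleave_block specializations in the files above: they accept a trailing bool as part of the common interface but never read it, and leaving that parameter unnamed is the standard C++ way to mark it unused without a helper macro. A minimal, self-contained sketch of the idiom — interleave_example is a hypothetical stand-in, not a library function:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical analogue of the specializations above: the trailing bool belongs to the
    // shared signature but is unused here, so it is left unnamed (optionally with a comment).
    void interleave_example(int8_t *&out, const int8_t *const *in,
                            size_t width, size_t height, size_t row_offset, bool /*first*/)
    {
        // Trivial stand-in body so the example compiles on its own: copy each row in turn.
        for (size_t y = 0; y < height; y++) {
            for (size_t x = 0; x < width; x++) {
                *out++ = in[y][row_offset + x];
            }
        }
    }
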
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
index 25262d3..c5c5af2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 4, VLType::SME, true>(
@@ -140,23 +140,23 @@
       ".inst 0xe01c2aa7  // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8760  // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
-      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
-      "sdot z19.s, z16.b, z20.b\n"
+      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
+      "sdot z19.s, z17.b, z20.b\n"
       ".inst 0xe0b08364  // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       ".inst 0xe0ae8361  // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829030  // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+      ".inst 0xc0829031  // mova z17.s, p4/M, za0v.s[x12, #1]\n"
       ".inst 0xe0ab8365  // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
-      ".inst 0xc08290b1  // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+      ".inst 0xc08290b0  // mova z16.s, p4/M, za1v.s[x12, #1]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x9\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "addvl x27, x27, #4\n"
       "add x13, x13, #0x8\n"
       "blt 5b\n"
@@ -172,28 +172,28 @@
       "add x25, %x[in], x16, LSL #3\n"
       "ldr x24, [x26, #0x0]\n"
       ".inst 0xe01c22a7  // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
-      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
       ".inst 0x25306d23  // psel p3.s, p11.s/Z, p9.s[w12]\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "ldr x23, [x25, #0x0]\n"
       ".inst 0x25306d22  // psel p2.s, p11.s/Z, p9.s[w12]\n"
       "ldr x22, [x26, #0x8]\n"
       ".inst 0x25706d21  // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829030  // mova z16.s, p4/M, za0v.s[x12, #1]\n"
+      ".inst 0xc0829031  // mova z17.s, p4/M, za0v.s[x12, #1]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8f60  // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc08290b1  // mova z17.s, p4/M, za1v.s[x12, #1]\n"
+      ".inst 0xc08290b0  // mova z16.s, p4/M, za1v.s[x12, #1]\n"
       "whilelt p9.b, x15, %x[width]\n"
       ".inst 0xe0b08b64  // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
       "incb x15\n"
       "add x26, x26, #0x10\n"
-      "sdot z19.s, z16.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
       ".inst 0xe0ae8761  // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
       "add x25, x25, #0x10\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "incb x28\n"
       ".inst 0xe0ab8365  // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
       "addvl x27, x27, #4\n"
@@ -217,23 +217,23 @@
       ".inst 0xe01c2aa5  // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8768  // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc0829110  // mova z16.s, p4/M, za2v.s[x12]\n"
-      ".inst 0xc0829191  // mova z17.s, p4/M, za3v.s[x12]\n"
-      "sdot z19.s, z16.b, z20.b\n"
+      ".inst 0xc0829111  // mova z17.s, p4/M, za2v.s[x12]\n"
+      ".inst 0xc0829190  // mova z16.s, p4/M, za3v.s[x12]\n"
+      "sdot z19.s, z17.b, z20.b\n"
       ".inst 0xe0b0836c  // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       ".inst 0xe0ae8369  // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829130  // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+      ".inst 0xc0829131  // mova z17.s, p4/M, za2v.s[x12, #1]\n"
       ".inst 0xe0ab836d  // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
-      ".inst 0xc08291b1  // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+      ".inst 0xc08291b0  // mova z16.s, p4/M, za3v.s[x12, #1]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x9\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "addvl x27, x27, #4\n"
       "add x13, x13, #0x8\n"
       "blt 7b\n"
@@ -249,28 +249,28 @@
       "add x25, %x[in], x16, LSL #3\n"
       "ldr x24, [x26, #0x0]\n"
       ".inst 0xe01c22a5  // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
-      ".inst 0xc0829110  // mova z16.s, p4/M, za2v.s[x12]\n"
+      ".inst 0xc0829111  // mova z17.s, p4/M, za2v.s[x12]\n"
       ".inst 0x25306d23  // psel p3.s, p11.s/Z, p9.s[w12]\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      ".inst 0xc0829191  // mova z17.s, p4/M, za3v.s[x12]\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      ".inst 0xc0829190  // mova z16.s, p4/M, za3v.s[x12]\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "ldr x23, [x25, #0x0]\n"
       ".inst 0x25306d22  // psel p2.s, p11.s/Z, p9.s[w12]\n"
       "ldr x22, [x26, #0x8]\n"
       ".inst 0x25706d21  // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829130  // mova z16.s, p4/M, za2v.s[x12, #1]\n"
+      ".inst 0xc0829131  // mova z17.s, p4/M, za2v.s[x12, #1]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8f68  // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc08291b1  // mova z17.s, p4/M, za3v.s[x12, #1]\n"
+      ".inst 0xc08291b0  // mova z16.s, p4/M, za3v.s[x12, #1]\n"
       "whilelt p9.b, x15, %x[width]\n"
       ".inst 0xe0b08b6c  // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
       "subs x20, x20, #0x1\n"
       "add x26, x26, #0x10\n"
-      "sdot z19.s, z16.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
       ".inst 0xe0ae8769  // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
       "add x25, x25, #0x10\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "incb x15\n"
       ".inst 0xe0ab836d  // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
       "addvl x27, x27, #4\n"
@@ -286,19 +286,19 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8360  // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
       ".inst 0xe0b08364  // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
-      "ldr x24, [x26, #0x0]\n"
+      "ldr x21, [x26, #0x0]\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
-      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
-      "ldr x23, [x26, x16, LSL #0x3]\n"
-      ".inst 0xe01c2302  // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
+      "ldr x20, [x26, x16, LSL #0x3]\n"
+      ".inst 0xe01c22a2  // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x253d6140  // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
       "cmp x12, x16\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      "sdot z18.s, z17.b, z20.b\n"
-      ".inst 0xe01c22e3  // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
+      ".inst 0xe01c2283  // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
       "add x26, x26, #0x8\n"
       "addvl x27, x27, #2\n"
       "add x13, x13, #0x4\n"
@@ -311,17 +311,17 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8368  // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0829110  // mova z16.s, p4/M, za2v.s[x12]\n"
+      ".inst 0xc0829111  // mova z17.s, p4/M, za2v.s[x12]\n"
       ".inst 0xe0b0836c  // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
-      ".inst 0xc0829191  // mova z17.s, p4/M, za3v.s[x12]\n"
+      ".inst 0xc0829190  // mova z16.s, p4/M, za3v.s[x12]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "addvl x27, x27, #2\n"
       "add x20, x20, #0x4\n"
       "blt 11b\n"
-      "whilelt p9.b, x15, %x[width]\n"
+      "whilelt p8.b, x15, %x[width]\n"
       "b 14f\n"
       "12:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -329,13 +329,13 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8360  // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
       ".inst 0xe0b08364  // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
-      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
+      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "sdot z19.s, z16.b, z20.b\n"
-      "sdot z18.s, z17.b, z20.b\n"
+      "sdot z19.s, z17.b, z20.b\n"
+      "sdot z18.s, z16.b, z20.b\n"
       "addvl x27, x27, #2\n"
       "blt 13b\n"
       "14:"  // K loop: End
@@ -350,4 +350,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
index 683a315..ce9a006 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 4, VLType::SME, false>(
   uint8_t * &out, const uint8_t * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntb x21\n"
@@ -248,13 +246,13 @@
       ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
-      "ldr x9, [x11, #0x0]\n"
+      "ldr x20, [x11, #0x0]\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
-      ".inst 0xe0162122  // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n"
-      "ldr x26, [x11, x16, LSL #0x3]\n"
+      ".inst 0xe0162282  // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n"
+      "ldr x20, [x11, x16, LSL #0x3]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x253d6140  // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
-      ".inst 0xe0162343  // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n"
+      ".inst 0xe0162283  // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n"
       "cmp x12, x16\n"
       "add x11, x11, #0x8\n"
       "addvl x21, x21, #2\n"
@@ -274,7 +272,7 @@
       "addvl x21, x21, #2\n"
       "add x20, x20, #0x4\n"
       "blt 10b\n"
-      "whilelt p9.b, x14, %x[width]\n"
+      "whilelt p8.b, x14, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -296,4 +294,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
index e7571f7..7805152 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 4, VLType::SME, true>(
@@ -140,23 +140,23 @@
       ".inst 0xe01c2aa7  // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8760  // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
-      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
-      "udot z19.s, z17.b, z20.b\n"
+      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
+      "udot z19.s, z16.b, z20.b\n"
       ".inst 0xe0b08364  // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       ".inst 0xe0ae8361  // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829031  // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+      ".inst 0xc0829030  // mova z16.s, p4/M, za0v.s[x12, #1]\n"
       ".inst 0xe0ab8365  // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
-      ".inst 0xc08290b0  // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+      ".inst 0xc08290b1  // mova z17.s, p4/M, za1v.s[x12, #1]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x9\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "udot z19.s, z17.b, z20.b\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       "addvl x27, x27, #4\n"
       "add x13, x13, #0x8\n"
       "blt 5b\n"
@@ -172,28 +172,28 @@
       "add x25, %x[in], x16, LSL #3\n"
       "ldr x24, [x26, #0x0]\n"
       ".inst 0xe01c22a7  // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n"
-      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
       ".inst 0x25306d23  // psel p3.s, p11.s/Z, p9.s[w12]\n"
-      "udot z19.s, z17.b, z20.b\n"
-      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
+      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
+      "udot z18.s, z17.b, z20.b\n"
       "ldr x23, [x25, #0x0]\n"
       ".inst 0x25306d22  // psel p2.s, p11.s/Z, p9.s[w12]\n"
       "ldr x22, [x26, #0x8]\n"
       ".inst 0x25706d21  // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829031  // mova z17.s, p4/M, za0v.s[x12, #1]\n"
+      ".inst 0xc0829030  // mova z16.s, p4/M, za0v.s[x12, #1]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8f60  // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc08290b0  // mova z16.s, p4/M, za1v.s[x12, #1]\n"
+      ".inst 0xc08290b1  // mova z17.s, p4/M, za1v.s[x12, #1]\n"
       "whilelt p9.b, x15, %x[width]\n"
       ".inst 0xe0b08b64  // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
       "incb x15\n"
       "add x26, x26, #0x10\n"
-      "udot z19.s, z17.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
       ".inst 0xe0ae8761  // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
       "add x25, x25, #0x10\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       "incb x28\n"
       ".inst 0xe0ab8365  // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
       "addvl x27, x27, #4\n"
@@ -217,23 +217,23 @@
       ".inst 0xe01c2aa5  // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8768  // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc0829111  // mova z17.s, p4/M, za2v.s[x12]\n"
-      ".inst 0xc0829190  // mova z16.s, p4/M, za3v.s[x12]\n"
-      "udot z19.s, z17.b, z20.b\n"
+      ".inst 0xc0829110  // mova z16.s, p4/M, za2v.s[x12]\n"
+      ".inst 0xc0829191  // mova z17.s, p4/M, za3v.s[x12]\n"
+      "udot z19.s, z16.b, z20.b\n"
       ".inst 0xe0b0836c  // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       ".inst 0xe0ae8369  // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829131  // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+      ".inst 0xc0829130  // mova z16.s, p4/M, za2v.s[x12, #1]\n"
       ".inst 0xe0ab836d  // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
-      ".inst 0xc08291b0  // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+      ".inst 0xc08291b1  // mova z17.s, p4/M, za3v.s[x12, #1]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x9\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "udot z19.s, z17.b, z20.b\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       "addvl x27, x27, #4\n"
       "add x13, x13, #0x8\n"
       "blt 7b\n"
@@ -249,28 +249,28 @@
       "add x25, %x[in], x16, LSL #3\n"
       "ldr x24, [x26, #0x0]\n"
       ".inst 0xe01c22a5  // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n"
-      ".inst 0xc0829111  // mova z17.s, p4/M, za2v.s[x12]\n"
+      ".inst 0xc0829110  // mova z16.s, p4/M, za2v.s[x12]\n"
       ".inst 0x25306d23  // psel p3.s, p11.s/Z, p9.s[w12]\n"
-      "udot z19.s, z17.b, z20.b\n"
-      ".inst 0xc0829190  // mova z16.s, p4/M, za3v.s[x12]\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
+      ".inst 0xc0829191  // mova z17.s, p4/M, za3v.s[x12]\n"
+      "udot z18.s, z17.b, z20.b\n"
       "ldr x23, [x25, #0x0]\n"
       ".inst 0x25306d22  // psel p2.s, p11.s/Z, p9.s[w12]\n"
       "ldr x22, [x26, #0x8]\n"
       ".inst 0x25706d21  // psel p1.s, p11.s/Z, p9.s[w12, #1]\n"
-      ".inst 0xc0829131  // mova z17.s, p4/M, za2v.s[x12, #1]\n"
+      ".inst 0xc0829130  // mova z16.s, p4/M, za2v.s[x12, #1]\n"
       ".inst 0x25706d20  // psel p0.s, p11.s/Z, p9.s[w12, #1]\n"
       "ldr x21, [x25, #0x8]\n"
       ".inst 0xe0bf8f68  // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n"
-      ".inst 0xc08291b0  // mova z16.s, p4/M, za3v.s[x12, #1]\n"
+      ".inst 0xc08291b1  // mova z17.s, p4/M, za3v.s[x12, #1]\n"
       "whilelt p9.b, x15, %x[width]\n"
       ".inst 0xe0b08b6c  // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n"
       "subs x20, x20, #0x1\n"
       "add x26, x26, #0x10\n"
-      "udot z19.s, z17.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
       ".inst 0xe0ae8769  // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n"
       "add x25, x25, #0x10\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       "incb x15\n"
       ".inst 0xe0ab836d  // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n"
       "addvl x27, x27, #4\n"
@@ -286,19 +286,19 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8360  // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
       ".inst 0xe0b08364  // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
-      "ldr x24, [x26, #0x0]\n"
+      "ldr x21, [x26, #0x0]\n"
       ".inst 0x25356140  // psel p0.b, p8.b/Z, p10.b[w13, #2]\n"
-      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
-      "ldr x23, [x26, x16, LSL #0x3]\n"
-      ".inst 0xe01c2302  // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n"
+      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
+      "ldr x20, [x26, x16, LSL #0x3]\n"
+      ".inst 0xe01c22a2  // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n"
       "add x12, x12, #0x1\n"
       ".inst 0x253d6140  // psel p0.b, p8.b/Z, p10.b[w13, #3]\n"
       "cmp x12, x16\n"
-      "udot z19.s, z17.b, z20.b\n"
-      "udot z18.s, z16.b, z20.b\n"
-      ".inst 0xe01c22e3  // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n"
+      "udot z19.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
+      ".inst 0xe01c2283  // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n"
       "add x26, x26, #0x8\n"
       "addvl x27, x27, #2\n"
       "add x13, x13, #0x4\n"
@@ -311,17 +311,17 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8368  // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0829111  // mova z17.s, p4/M, za2v.s[x12]\n"
+      ".inst 0xc0829110  // mova z16.s, p4/M, za2v.s[x12]\n"
       ".inst 0xe0b0836c  // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
-      ".inst 0xc0829190  // mova z16.s, p4/M, za3v.s[x12]\n"
+      ".inst 0xc0829191  // mova z17.s, p4/M, za3v.s[x12]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "udot z19.s, z17.b, z20.b\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       "addvl x27, x27, #2\n"
       "add x20, x20, #0x4\n"
       "blt 11b\n"
-      "whilelt p9.b, x15, %x[width]\n"
+      "whilelt p8.b, x15, %x[width]\n"
       "b 14f\n"
       "12:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
@@ -329,13 +329,13 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8360  // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0829011  // mova z17.s, p4/M, za0v.s[x12]\n"
+      ".inst 0xc0829010  // mova z16.s, p4/M, za0v.s[x12]\n"
       ".inst 0xe0b08364  // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n"
-      ".inst 0xc0829090  // mova z16.s, p4/M, za1v.s[x12]\n"
+      ".inst 0xc0829091  // mova z17.s, p4/M, za1v.s[x12]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x17\n"
-      "udot z19.s, z17.b, z20.b\n"
-      "udot z18.s, z16.b, z20.b\n"
+      "udot z19.s, z16.b, z20.b\n"
+      "udot z18.s, z17.b, z20.b\n"
       "addvl x27, x27, #2\n"
       "blt 13b\n"
       "14:"  // K loop: End
@@ -350,4 +350,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
index 522f310..96ab55e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 1, VLType::SME, false>(
   __fp16 * &out, const __fp16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cnth x28\n"
@@ -97,4 +95,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
index 949e003..ac4b1b5 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<2, 1, VLType::SME, false>(
   float * &out, const float * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "mov x22, %x[width]\n"
@@ -55,12 +53,12 @@
       "ldr x25, [x11, #0x8]\n"
       "and x24, x22, #0x1\n"  // odd_tail = bool(n_passes & 0x1)
       "csel x15, x15, x16, NE\n"
-      "ldr x23, [x9, #0x8]\n"
+      "ldr x21, [x9, #0x8]\n"
       "ptrue p13.s\n"
       "whilelt p12.s, XZR, %x[height]\n"
       "whilelt p11.s, x16, %x[height]\n"
-      "mov x22, %x[row_offset]\n"
-      "mov x21, %x[out]\n"
+      "mov x23, %x[row_offset]\n"
+      "mov x22, %x[out]\n"
       "whilelt p10.s, x13, %x[width]\n"
       "whilelt p9.s, x13, %x[width]\n"
       "whilelt p8.s, x13, %x[width]\n"
@@ -71,39 +69,39 @@
       "1:"  // K loop: Charge: Loop
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      ".inst 0xe0960540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+      ".inst 0xe0970540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
       "ldr x10, [x11, #0x0]\n"
-      ".inst 0xe0960364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      ".inst 0xe0970364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
       ".inst 0x25706581  // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
       ".inst 0x25706160  // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
       "ldr x27, [x9, #0x0]\n"
-      ".inst 0xe0960721  // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0970721  // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
       "ldr x25, [x11, #0x8]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe09602e5  // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+      ".inst 0xe09702a5  // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x14\n"
-      "ldr x23, [x9, #0x8]\n"
+      "ldr x21, [x9, #0x8]\n"
       "add x9, x9, #0x10\n"
       "blt 1b\n"
       "2:"  // K loop: Charge: End
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      ".inst 0xe0960540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
-      ".inst 0xe0960364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      ".inst 0xe0970540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+      ".inst 0xe0970364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
       ".inst 0x25706581  // psel p1.s, p9.s/Z, p12.s[w12, #1]\n"
       ".inst 0x25706160  // psel p0.s, p8.s/Z, p11.s[w12, #1]\n"
       "mov x11, %x[in]\n"
       "add x9, %x[in], x16, LSL #3\n"
-      ".inst 0xe0960721  // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0970721  // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n"
       "ldr x10, [x11, #0x0]\n"
-      ".inst 0xe09602e5  // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n"
+      ".inst 0xe09702a5  // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n"
       "ldr x27, [x9, #0x0]\n"
-      "incw x22\n"
+      "incw x23\n"
       "incw x13\n"
       "ldr x25, [x11, #0x8]\n"
       "add x11, x11, #0x10\n"
-      "ldr x23, [x9, #0x8]\n"
+      "ldr x21, [x9, #0x8]\n"
       "add x9, x9, #0x10\n"
       "cbz x20, 8f\n"
       "mov x20, x20\n"
@@ -115,59 +113,59 @@
       "4:"  // K loop: Main loop: First: Loop
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      ".inst 0xe0960548  // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+      ".inst 0xe0970548  // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
       "ldr x10, [x11, #0x0]\n"
-      ".inst 0xe096036c  // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      ".inst 0xe097036c  // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
       ".inst 0x25706580  // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
       ".inst 0x25706162  // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
       "ldr x27, [x9, #0x0]\n"
       ".inst 0x25307541  // psel p1.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0960329  // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0970329  // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
       "ldr x25, [x11, #0x8]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0960aed  // ld1w { za3h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n"
-      "ldr x23, [x9, #0x8]\n"
-      ".inst 0xe0bf86a0  // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0970aad  // ld1w { za3h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+      "ldr x21, [x9, #0x8]\n"
+      ".inst 0xe0bf86c0  // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe0bc86a1  // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c1  // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x9, x9, #0x10\n"
-      ".inst 0xe0ba82a5  // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
+      ".inst 0xe0ba82c5  // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x14\n"
-      "addvl x21, x21, #4\n"
+      "addvl x22, x22, #4\n"
       "blt 4b\n"
       "5:"  // K loop: Main loop: First: Tail
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      ".inst 0xe0960548  // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
-      ".inst 0xe096036c  // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      ".inst 0xe0970548  // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+      ".inst 0xe097036c  // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
       "mov x11, %x[in]\n"
       "add x9, %x[in], x16, LSL #3\n"
       "ldr x10, [x11, #0x0]\n"
       ".inst 0x25706580  // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
       ".inst 0x25706161  // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
-      ".inst 0xe0960329  // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0970329  // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
       "ldr x27, [x9, #0x0]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe09606ed  // ld1w { za3h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+      ".inst 0xe09706ad  // ld1w { za3h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
       "ldr x25, [x11, #0x8]\n"
       ".inst 0x25307542  // psel p2.s, p13.s/Z, p10.s[w12]\n"
-      "ldr x23, [x9, #0x8]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      "ldr x21, [x9, #0x8]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b08aa4  // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b08ac4  // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
       "whilelt p10.s, x13, %x[width]\n"
       "incw x13\n"
-      ".inst 0xe0bc86a1  // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c1  // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x11, x11, #0x10\n"
       "add x9, x9, #0x10\n"
-      ".inst 0xe0ba82a5  // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
-      "addvl x21, x21, #4\n"
-      "incw x22\n"
+      ".inst 0xe0ba82c5  // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+      "addvl x22, x22, #4\n"
+      "incw x23\n"
       "whilelt p9.s, x13, %x[width]\n"
       "whilelt p8.s, x13, %x[width]\n"
       "mov x12, #0x0\n"
@@ -175,60 +173,60 @@
       "6:"  // K loop: Main loop: Second: Loop
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      ".inst 0xe0960540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+      ".inst 0xe0970540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
       "ldr x10, [x11, #0x0]\n"
-      ".inst 0xe0960364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      ".inst 0xe0970364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
       ".inst 0x25706580  // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
       ".inst 0x25706162  // psel p2.s, p8.s/Z, p11.s[w12, #1]\n"
       "ldr x27, [x9, #0x0]\n"
       ".inst 0x25307541  // psel p1.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0960321  // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0970321  // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
       "ldr x25, [x11, #0x8]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0960ae5  // ld1w { za1h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n"
-      "ldr x23, [x9, #0x8]\n"
-      ".inst 0xe0bf86a8  // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0970aa5  // ld1w { za1h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n"
+      "ldr x21, [x9, #0x8]\n"
+      ".inst 0xe0bf86c8  // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b082ac  // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082cc  // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
       "add x11, x11, #0x10\n"
-      ".inst 0xe0bc86a9  // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c9  // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x9, x9, #0x10\n"
-      ".inst 0xe0ba82ad  // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
+      ".inst 0xe0ba82cd  // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x14\n"
-      "addvl x21, x21, #4\n"
+      "addvl x22, x22, #4\n"
       "blt 6b\n"
       "7:"  // K loop: Main loop: Second: Tail
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      ".inst 0xe0960540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
-      ".inst 0xe0960364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      ".inst 0xe0970540  // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n"
+      ".inst 0xe0970364  // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n"
       "mov x11, %x[in]\n"
       "add x9, %x[in], x16, LSL #3\n"
       "ldr x10, [x11, #0x0]\n"
       ".inst 0x25706580  // psel p0.s, p9.s/Z, p12.s[w12, #1]\n"
       ".inst 0x25706161  // psel p1.s, p8.s/Z, p11.s[w12, #1]\n"
-      ".inst 0xe0960321  // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n"
+      ".inst 0xe0970321  // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n"
       "ldr x27, [x9, #0x0]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe09606e5  // ld1w { za1h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n"
+      ".inst 0xe09706a5  // ld1w { za1h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n"
       "ldr x25, [x11, #0x8]\n"
       ".inst 0x25307542  // psel p2.s, p13.s/Z, p10.s[w12]\n"
-      "ldr x23, [x9, #0x8]\n"
-      ".inst 0xe0bf82a8  // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      "ldr x21, [x9, #0x8]\n"
+      ".inst 0xe0bf82c8  // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25707541  // psel p1.s, p13.s/Z, p10.s[w12, #1]\n"
       ".inst 0x25707540  // psel p0.s, p13.s/Z, p10.s[w12, #1]\n"
-      ".inst 0xe0b08aac  // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b08acc  // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n"
       "whilelt p10.s, x13, %x[width]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xe0bc86a9  // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n"
+      ".inst 0xe0bc86c9  // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n"
       "add x11, x11, #0x10\n"
       "add x9, x9, #0x10\n"
-      ".inst 0xe0ba82ad  // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n"
-      "addvl x21, x21, #4\n"
+      ".inst 0xe0ba82cd  // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n"
+      "addvl x22, x22, #4\n"
       "incw x13\n"
-      "incw x22\n"
+      "incw x23\n"
       "bgt 3b\n"
       "8:"  // K loop: Tails
       "cbnz x24, 11f\n"
@@ -238,48 +236,48 @@
       "mov x12, #0x0\n"
       "9:"  // K loop: Tails: Even: First
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
-      "ldr x10, [x11, #0x0]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
+      "ldr x21, [x11, #0x0]\n"
       ".inst 0x25306581  // psel p1.s, p9.s/Z, p12.s[w12]\n"
       ".inst 0x25306160  // psel p0.s, p8.s/Z, p11.s[w12]\n"
-      "ldr x27, [x11, x16, LSL #0x3]\n"
-      ".inst 0xe0960548  // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n"
+      "ldr x20, [x11, x16, LSL #0x3]\n"
+      ".inst 0xe09706a8  // ld1w { za2h.s[x12] }, p1/Z, [x21, x23, LSL #2]\n"
       "add x11, x11, #0x8\n"
-      "addvl x21, x21, #2\n"
-      ".inst 0xe096036c  // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n"
+      "addvl x22, x22, #2\n"
+      ".inst 0xe097028c  // ld1w { za3h.s[x12] }, p0/Z, [x20, x23, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x16\n"
       "blt 9b\n"
       "whilelt p10.s, x13, %x[width]\n"
-      "whilelt p9.s, x13, %x[width]\n"
+      "whilelt p8.s, x13, %x[width]\n"
       "whilelt p8.s, x13, %x[width]\n"
       "mov x12, #0x0\n"
       "10:"  // K loop: Tails: Even: Second
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a8  // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c8  // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082ac  // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082cc  // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x15\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "blt 10b\n"
-      "whilelt p10.s, x13, %x[width]\n"
+      "whilelt p8.s, x13, %x[width]\n"
       "b 13f\n"
       "11:"  // K loop: Tails: Odd
       "mov x12, #0x0\n"
       "12:"  // K loop: Tails: Odd: Loop
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0bf82a0  // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n"
+      ".inst 0xe0bf82c0  // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n"
       ".inst 0x25307540  // psel p0.s, p13.s/Z, p10.s[w12]\n"
-      ".inst 0xe0b082a4  // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n"
+      ".inst 0xe0b082c4  // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x15\n"
-      "addvl x21, x21, #2\n"
+      "addvl x22, x22, #2\n"
       "blt 12b\n"
       "13:"  // K loop: End
-      "mov %x[out], x21\n"
+      "mov %x[out], x22\n"
       ".inst 0xd503467f  // SMSTOP\n"
       : [out] "+&r" (out)
       : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width)
@@ -287,4 +285,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
index 4cc84d3..2e53475 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<4, 2, VLType::SME, false>(
   bfloat16 * &out, const bfloat16 * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x16\n"
@@ -124,4 +122,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
index 465939c..67dd5a9 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<4, 4, VLType::SME, false>(
   int8_t * &out, const int8_t * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x16\n"
@@ -123,4 +121,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
index ffd9384..21d9378 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<4, 4, VLType::SME, true>(
@@ -112,22 +112,22 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8120  // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0828812  // mova z18.s, p2/M, za0v.s[x12]\n"
+      ".inst 0xc0828811  // mova z17.s, p2/M, za0v.s[x12]\n"
       ".inst 0xe0af8124  // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
       ".inst 0x25306d21  // psel p1.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0828891  // mova z17.s, p2/M, za1v.s[x12]\n"
+      ".inst 0xc0828893  // mova z19.s, p2/M, za1v.s[x12]\n"
       ".inst 0xe0ae8528  // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
       ".inst 0xc0828910  // mova z16.s, p2/M, za2v.s[x12]\n"
-      "sdot z23.s, z18.b, z24.b\n"
+      "sdot z23.s, z17.b, z24.b\n"
       ".inst 0xe0ad812c  // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
-      ".inst 0xc0828993  // mova z19.s, p2/M, za3v.s[x12]\n"
+      ".inst 0xc0828992  // mova z18.s, p2/M, za3v.s[x12]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x20\n"
-      "sdot z22.s, z17.b, z24.b\n"
+      "sdot z22.s, z19.b, z24.b\n"
       "sdot z21.s, z16.b, z24.b\n"
       "addvl x9, x9, #4\n"
-      "sdot z20.s, z19.b, z24.b\n"
+      "sdot z20.s, z18.b, z24.b\n"
       "blt 5b\n"
       "incb x28\n"
       "whilelt p9.b, x28, %x[width]\n"
@@ -147,4 +147,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
index 9f5db6b..f149c93 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<4, 4, VLType::SME, false>(
   uint8_t * &out, const uint8_t * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x16\n"
@@ -123,4 +121,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
index 49d2acf..252152e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<4, 4, VLType::SME, true>(
@@ -112,22 +112,22 @@
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xe0bf8120  // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
-      ".inst 0xc0828813  // mova z19.s, p2/M, za0v.s[x12]\n"
+      ".inst 0xc0828810  // mova z16.s, p2/M, za0v.s[x12]\n"
       ".inst 0xe0af8124  // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n"
       ".inst 0x25306d21  // psel p1.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0x25306d20  // psel p0.s, p11.s/Z, p9.s[w12]\n"
       ".inst 0xc0828891  // mova z17.s, p2/M, za1v.s[x12]\n"
       ".inst 0xe0ae8528  // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n"
-      ".inst 0xc0828912  // mova z18.s, p2/M, za2v.s[x12]\n"
-      "udot z23.s, z19.b, z24.b\n"
+      ".inst 0xc0828913  // mova z19.s, p2/M, za2v.s[x12]\n"
+      "udot z23.s, z16.b, z24.b\n"
       ".inst 0xe0ad812c  // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n"
-      ".inst 0xc0828990  // mova z16.s, p2/M, za3v.s[x12]\n"
+      ".inst 0xc0828992  // mova z18.s, p2/M, za3v.s[x12]\n"
       "add x12, x12, #0x1\n"
       "cmp x12, x20\n"
       "udot z22.s, z17.b, z24.b\n"
-      "udot z21.s, z18.b, z24.b\n"
+      "udot z21.s, z19.b, z24.b\n"
       "addvl x9, x9, #4\n"
-      "udot z20.s, z16.b, z24.b\n"
+      "udot z20.s, z18.b, z24.b\n"
       "blt 5b\n"
       "incb x28\n"
       "whilelt p9.b, x28, %x[width]\n"
@@ -147,4 +147,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
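
For context on the _summing variants above: they fold per-row sums into the interleave by sdot/udot-ing each stored vector of bytes against a constant operand (z20/z24 in these kernels; a splat of 1 is assumed here, which is the usual trick for byte row sums, though the splat itself is not visible in this excerpt). A scalar sketch of the quantity being accumulated — this models the result only, not the SME kernel:

    #include <cstddef>
    #include <cstdint>

    // Illustrative scalar model: the row sum accumulated for one row is simply the sum of
    // that row's int8 elements over the interleaved K range, widened to int32. The kernels
    // above compute the same value with sdot against a vector assumed to hold all ones.
    int32_t row_sum_s8(const int8_t *row, size_t k0, size_t kmax)
    {
        int32_t sum = 0;
        for (size_t k = k0; k < kmax; k++) {
            sum += static_cast<int32_t>(row[k]);
        }
        return sum;
    }
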
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
index 9579263..b11bb93 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 template <>
 void interleave_block<4, 1, VLType::SME, false>(
   float * &out, const float * const *in,
-  size_t width, size_t height, size_t row_offset, bool first
+  size_t width, size_t height, size_t row_offset, bool
 )
 {
-  ARM_COMPUTE_UNUSED(first);
-
   __asm__ __volatile__(
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x15\n"
@@ -123,4 +121,4 @@
     );
 }
 
-#endif  // defined(__ARM_FEATURE_SVE)
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
index 4f25da2..b921fd1 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,8 +39,12 @@
  */
 template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
 void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
     const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                                   (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+    const unsigned int int_by = height_vectors;
+#endif
 
     std::vector<int32_t> the_sums;
 
@@ -104,8 +108,12 @@
 
 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
 inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                                   (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+    const unsigned int height = height_vectors;
+#endif
 
     // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
     if (row_sum_multiplier) {
@@ -138,8 +146,12 @@
                         unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
                         const unsigned int k0, const unsigned int kmax, bool integrate_sums,
                         const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                                   (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+    const unsigned int height = height_vectors;
+#endif
 
     // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
     // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
@@ -208,8 +220,12 @@
 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
 void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
         const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                                   (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+    const unsigned int height = height_vectors;
+#endif
     auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
 
     // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
@@ -246,8 +262,12 @@
 
 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
 void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                                   (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+    const unsigned int height = height_vectors;
+#endif
     // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
 
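The interleave_indirect_impl.hpp hunks above all follow one pattern: the interleave height is height_vectors scaled by (vector length / block) for SVE or SME kernels when SVE support is compiled in, and collapses to height_vectors alone otherwise. A standalone sketch of that selection, with the VLType enum and get_vector_length helpers stubbed out as hypothetical stand-ins for the real arm_gemm definitions:

    #include <cstddef>

    // Hypothetical stand-ins; only the guarded selection logic mirrors the code above.
    enum class VLType { None, SVE, SME };

    template <typename T> std::size_t get_vector_length() { return 32 / sizeof(T); }  // e.g. 256-bit SVE
    namespace sme {
    template <typename T> std::size_t get_vector_length() { return 64 / sizeof(T); }  // e.g. 512-bit SME
    }

    template <typename TOut>
    unsigned int interleave_height(unsigned int height_vectors, unsigned int block, VLType vlt)
    {
    #ifdef ARM_COMPUTE_ENABLE_SVE
        // Vector-length-agnostic kernels: scale by (vector length / block) for SVE or SME.
        return height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block
                               : (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1));
    #else
        // Fixed-length (NEON-only) build: the height is just the vector count.
        (void)block; (void)vlt;
        return height_vectors;
    #endif
    }
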
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
index 9a871d4..72e4149 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
index 74791f8..377dadd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -231,11 +231,11 @@
       "17:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 18f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -251,41 +251,41 @@
       "ldr q6, [x12, #0x10]\n"
       "blt 21f\n"
       "20:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      "trn1 v20.2d, v1.2d, v21.2d\n"
+      ".inst 0x6e47ee88  // bfmmla v8.4s, v20.8h, v7.8h\n"
+      "ldr q17, [x11, #0x0]\n"
+      ".inst 0x6e46ee8c  // bfmmla v12.4s, v20.8h, v6.8h\n"
+      "ldr q19, [x11, #0x10]\n"
+      ".inst 0x6e51ee89  // bfmmla v9.4s, v20.8h, v17.8h\n"
+      "ldr q18, [x10, #0x0]\n"
+      ".inst 0x6e53ee8d  // bfmmla v13.4s, v20.8h, v19.8h\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee8a  // bfmmla v10.4s, v20.8h, v18.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      ".inst 0x6e51ee8e  // bfmmla v14.4s, v20.8h, v17.8h\n"
+      "ldr q17, [x9, #0x10]\n"
+      "trn2 v1.2d, v1.2d, v21.2d\n"
+      ".inst 0x6e52ee8b  // bfmmla v11.4s, v20.8h, v18.8h\n"
+      "ldr q18, [x12, #0x20]\n"
+      ".inst 0x6e51ee8f  // bfmmla v15.4s, v20.8h, v17.8h\n"
+      "ldr q17, [x12, #0x30]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x11, #0x20]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x11, #0x30]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x9, #0x20]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x9, #0x30]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "ldr q1, [x26, #0x0]\n"
       "add x12, x12, #0x40\n"
       "ldr q7, [x12, #0x0]\n"
@@ -295,39 +295,39 @@
       "add x9, x9, #0x40\n"
       "bge 20b\n"
       "21:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      "trn1 v19.2d, v1.2d, v20.2d\n"
+      ".inst 0x6e47ee68  // bfmmla v8.4s, v19.8h, v7.8h\n"
+      "ldr q17, [x11, #0x0]\n"
+      ".inst 0x6e46ee6c  // bfmmla v12.4s, v19.8h, v6.8h\n"
+      "ldr q18, [x11, #0x10]\n"
+      ".inst 0x6e51ee69  // bfmmla v9.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x0]\n"
+      ".inst 0x6e52ee6d  // bfmmla v13.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x10]\n"
+      ".inst 0x6e51ee6a  // bfmmla v10.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x9, #0x0]\n"
+      ".inst 0x6e52ee6e  // bfmmla v14.4s, v19.8h, v18.8h\n"
+      "ldr q24, [x9, #0x10]\n"
+      "trn2 v1.2d, v1.2d, v20.2d\n"
+      ".inst 0x6e51ee6b  // bfmmla v11.4s, v19.8h, v17.8h\n"
+      "ldr q18, [x12, #0x20]\n"
+      ".inst 0x6e58ee6f  // bfmmla v15.4s, v19.8h, v24.8h\n"
+      "ldr q17, [x12, #0x30]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q19, [x11, #0x20]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x11, #0x30]\n"
+      ".inst 0x6e53ec29  // bfmmla v9.4s, v1.8h, v19.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x9, #0x20]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x9, #0x30]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "add x26, x26, #0x10\n"
       "add x12, x12, #0x40\n"
       "add x11, x11, #0x40\n"
@@ -338,26 +338,26 @@
       "cmp x27, #0x4\n"
       "blt 24f\n"
       "23:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr q6, [x12, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr q7, [x12, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
+      "ldr d19, [x26], #0x8\n"
+      "ldr q18, [x12, #0x0]\n"
+      "trn1 v19.2d, v19.2d, v17.2d\n"
+      "ldr q17, [x12, #0x10]\n"
+      ".inst 0x6e52ee68  // bfmmla v8.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x11, #0x0]\n"
+      ".inst 0x6e51ee6c  // bfmmla v12.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x11, #0x10]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x0]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x9, #0x10]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
       "add x10, x10, #0x20\n"
@@ -373,23 +373,23 @@
       "25:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr h1, [x26, #0x0]\n"
       "26:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x12, #0x0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      "ldr q20, [x12, #0x0]\n"
+      "ldr q18, [x12, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v17.2d\n"
+      ".inst 0x6e54ee68  // bfmmla v8.4s, v19.8h, v20.8h\n"
+      "ldr q17, [x11, #0x0]\n"
+      ".inst 0x6e52ee6c  // bfmmla v12.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x11, #0x10]\n"
+      ".inst 0x6e51ee69  // bfmmla v9.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x0]\n"
+      ".inst 0x6e52ee6d  // bfmmla v13.4s, v19.8h, v18.8h\n"
       "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e51ee6a  // bfmmla v10.4s, v19.8h, v17.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      ".inst 0x6e46ee6e  // bfmmla v14.4s, v19.8h, v6.8h\n"
+      "ldr q17, [x9, #0x10]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
       "add x10, x10, #0x20\n"
@@ -405,17 +405,17 @@
       "uzp1 v11.2d, v11.2d, v15.2d\n"
       "tbz %x[flags], #1, 28f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v18.4s\n"
+      "fmin v9.4s, v9.4s, v18.4s\n"
+      "fmin v10.4s, v10.4s, v18.4s\n"
+      "fmin v11.4s, v11.4s, v18.4s\n"
+      "fmax v8.4s, v8.4s, v17.4s\n"
+      "fmax v9.4s, v9.4s, v17.4s\n"
+      "fmax v10.4s, v10.4s, v17.4s\n"
+      "fmax v11.4s, v11.4s, v17.4s\n"
       "28:"  // Height 1: No activation
       "cmp x14, #0x10\n"
       "bge 37f\n"
@@ -624,12 +624,12 @@
       "55:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 56f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 57f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -637,7 +637,7 @@
       "b 57f\n"
       "56:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "57:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "blt 60f\n"
@@ -648,45 +648,45 @@
       "ldr q6, [x12, #0x10]\n"
       "blt 59f\n"
       "58:"  // Height 2: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e47ee68  // bfmmla v8.4s, v19.8h, v7.8h\n"
+      "ldr q18, [x11, #0x0]\n"
+      ".inst 0x6e46ee6c  // bfmmla v12.4s, v19.8h, v6.8h\n"
+      "ldr q17, [x11, #0x10]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x0]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x9, #0x10]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x12, #0x20]\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x12, #0x30]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x11, #0x20]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x11, #0x30]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x9, #0x20]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x9, #0x30]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
       "add x12, x12, #0x40\n"
       "ldr q7, [x12, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "ldr q1, [x26, #0x0]\n"
       "ldr q6, [x12, #0x10]\n"
       "add x11, x11, #0x40\n"
@@ -694,39 +694,39 @@
       "add x9, x9, #0x40\n"
       "bge 58b\n"
       "59:"  // Height 2: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e47ee68  // bfmmla v8.4s, v19.8h, v7.8h\n"
+      "ldr q18, [x11, #0x0]\n"
+      ".inst 0x6e46ee6c  // bfmmla v12.4s, v19.8h, v6.8h\n"
+      "ldr q17, [x11, #0x10]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x0]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x9, #0x10]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x12, #0x20]\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x12, #0x30]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x11, #0x20]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x11, #0x30]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x9, #0x20]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x9, #0x30]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "add x12, x12, #0x40\n"
@@ -738,27 +738,27 @@
       "cmp x27, #0x4\n"
       "blt 62f\n"
       "61:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d18, [x26], #0x8\n"
+      "ldr d17, [x25], #0x8\n"
+      "trn1 v19.2d, v18.2d, v17.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x12, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
+      "ldr q18, [x12, #0x0]\n"
+      "ldr q17, [x12, #0x10]\n"
+      ".inst 0x6e52ee68  // bfmmla v8.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6c  // bfmmla v12.4s, v19.8h, v17.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x6e5aee69  // bfmmla v9.4s, v19.8h, v26.8h\n"
+      ".inst 0x6e46ee6d  // bfmmla v13.4s, v19.8h, v6.8h\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      "ldr q17, [x9, #0x10]\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
       "add x10, x10, #0x20\n"
@@ -777,23 +777,23 @@
       "ldr h1, [x26, #0x0]\n"
       "ldr h2, [x25, #0x0]\n"
       "64:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x12, #0x0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      "ldr q18, [x12, #0x0]\n"
+      "ldr q17, [x12, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e52ee68  // bfmmla v8.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x11, #0x0]\n"
+      ".inst 0x6e51ee6c  // bfmmla v12.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x11, #0x10]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q3, [x10, #0x0]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q27, [x10, #0x10]\n"
+      ".inst 0x6e43ee6a  // bfmmla v10.4s, v19.8h, v3.8h\n"
+      "ldr q18, [x9, #0x0]\n"
+      ".inst 0x6e5bee6e  // bfmmla v14.4s, v19.8h, v27.8h\n"
+      "ldr q17, [x9, #0x10]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
       "add x10, x10, #0x20\n"
@@ -815,25 +815,25 @@
       "uzp2 v11.2d, v11.2d, v15.2d\n"
       "tbz %x[flags], #1, 66f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v7.4s, v7.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v7.4s, v7.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "fmin v7.4s, v7.4s, v18.4s\n"
+      "fmin v12.4s, v12.4s, v18.4s\n"
+      "fmin v13.4s, v13.4s, v18.4s\n"
+      "fmin v14.4s, v14.4s, v18.4s\n"
+      "fmin v8.4s, v8.4s, v18.4s\n"
+      "fmin v9.4s, v9.4s, v18.4s\n"
+      "fmin v10.4s, v10.4s, v18.4s\n"
+      "fmin v11.4s, v11.4s, v18.4s\n"
+      "fmax v7.4s, v7.4s, v17.4s\n"
+      "fmax v12.4s, v12.4s, v17.4s\n"
+      "fmax v13.4s, v13.4s, v17.4s\n"
+      "fmax v14.4s, v14.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v17.4s\n"
+      "fmax v9.4s, v9.4s, v17.4s\n"
+      "fmax v10.4s, v10.4s, v17.4s\n"
+      "fmax v11.4s, v11.4s, v17.4s\n"
       "66:"  // Height 2: No activation
       "cmp x14, #0x10\n"
       "bge 75f\n"
@@ -1107,13 +1107,13 @@
       "93:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 94f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 95f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1122,8 +1122,8 @@
       "b 95f\n"
       "94:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "95:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "blt 98f\n"
@@ -1135,170 +1135,170 @@
       "ldr q6, [x12, #0x10]\n"
       "blt 97f\n"
       "96:"  // Height 3: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x11, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x12, #0x20]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x12, #0x30]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x11, #0x20]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
       "add x12, x12, #0x40\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x11, #0x30]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x9, #0x20]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
       "ldr q7, [x12, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x12, #0x10]\n"
       "bge 96b\n"
       "97:"  // Height 3: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x11, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x12, #0x20]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x12, #0x30]\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
       "add x12, x12, #0x40\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x11, #0x20]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x11, #0x30]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x9, #0x20]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "98:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 103f\n"
       "cmp x27, #0x4\n"
       "blt 100f\n"
       "99:"  // Height 3: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr q6, [x12, #0x0]\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q7, [x12, #0x10]\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr q26, [x12, #0x0]\n"
+      "trn1 v27.2d, v25.2d, v27.2d\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      "ldr q25, [x12, #0x10]\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e59ef8c  // bfmmla v12.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef74  // bfmmla v20.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x11, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
       "sub x27, x27, #0x4\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "bge 99b\n"
       "100:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 103f\n"
@@ -1316,36 +1316,36 @@
       "ldr h2, [x25, #0x0]\n"
       "ldr h3, [x24, #0x0]\n"
       "102:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x12, #0x0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q29, [x12, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v25.2d\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e5def8c  // bfmmla v12.4s, v28.8h, v29.8h\n"
+      ".inst 0x6e5def74  // bfmmla v20.4s, v27.8h, v29.8h\n"
+      "ldr q25, [x11, #0x10]\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "103:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1368,33 +1368,33 @@
       "uzp1 v19.2d, v19.2d, v23.2d\n"
       "tbz %x[flags], #1, 104f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v7.4s, v7.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v7.4s, v7.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v25.4s }, [x20]\n"
+      "fmin v7.4s, v7.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmax v7.4s, v7.4s, v25.4s\n"
+      "fmax v12.4s, v12.4s, v25.4s\n"
+      "fmax v13.4s, v13.4s, v25.4s\n"
+      "fmax v14.4s, v14.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v25.4s\n"
+      "fmax v9.4s, v9.4s, v25.4s\n"
+      "fmax v10.4s, v10.4s, v25.4s\n"
+      "fmax v11.4s, v11.4s, v25.4s\n"
+      "fmax v16.4s, v16.4s, v25.4s\n"
+      "fmax v17.4s, v17.4s, v25.4s\n"
+      "fmax v18.4s, v18.4s, v25.4s\n"
+      "fmax v19.4s, v19.4s, v25.4s\n"
       "104:"  // Height 3: No activation
       "cmp x14, #0x10\n"
       "bge 113f\n"
@@ -1709,14 +1709,14 @@
       "131:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 132f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 133f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1726,9 +1726,9 @@
       "b 133f\n"
       "132:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "133:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "blt 136f\n"
@@ -1741,174 +1741,174 @@
       "ldr q6, [x12, #0x10]\n"
       "blt 135f\n"
       "134:"  // Height 4: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
       "sub x27, x27, #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x11, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x12, #0x20]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
       "add x23, x23, #0x10\n"
       "ldr q4, [x23, #0x0]\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x12, #0x30]\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x11, #0x20]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x11, #0x30]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "add x12, x12, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x9, #0x20]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
       "ldr q7, [x12, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x12, #0x10]\n"
       "bge 134b\n"
       "135:"  // Height 4: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
       "sub x27, x27, #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x11, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x12, #0x20]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x12, #0x30]\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
       "add x12, x12, #0x40\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x11, #0x20]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x11, #0x30]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x9, #0x20]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "136:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 141f\n"
       "cmp x27, #0x4\n"
       "blt 138f\n"
       "137:"  // Height 4: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "trn1 v27.2d, v26.2d, v25.2d\n"
       "cmp x27, #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x12, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q25, [x12, #0x10]\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x11, #0x0]\n"
+      ".inst 0x6e59ef8c  // bfmmla v12.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef74  // bfmmla v20.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x11, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x10]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "bge 137b\n"
       "138:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 141f\n"
@@ -1929,36 +1929,36 @@
       "ldr h3, [x24, #0x0]\n"
       "ldr h4, [x23, #0x0]\n"
       "140:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x12, #0x0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q25, [x12, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x11, #0x0]\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x6e59ef8c  // bfmmla v12.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef74  // bfmmla v20.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x11, #0x10]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x0]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x10]\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x9, #0x0]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x9, #0x10]\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "141:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1986,41 +1986,41 @@
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "tbz %x[flags], #1, 142f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v7.4s, v7.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v7.4s, v7.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v25.4s }, [x20]\n"
+      "fmin v7.4s, v7.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v15.4s, v15.4s, v26.4s\n"
+      "fmin v20.4s, v20.4s, v26.4s\n"
+      "fmin v21.4s, v21.4s, v26.4s\n"
+      "fmin v22.4s, v22.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmax v7.4s, v7.4s, v25.4s\n"
+      "fmax v12.4s, v12.4s, v25.4s\n"
+      "fmax v13.4s, v13.4s, v25.4s\n"
+      "fmax v14.4s, v14.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v25.4s\n"
+      "fmax v9.4s, v9.4s, v25.4s\n"
+      "fmax v10.4s, v10.4s, v25.4s\n"
+      "fmax v11.4s, v11.4s, v25.4s\n"
+      "fmax v15.4s, v15.4s, v25.4s\n"
+      "fmax v20.4s, v20.4s, v25.4s\n"
+      "fmax v21.4s, v21.4s, v25.4s\n"
+      "fmax v22.4s, v22.4s, v25.4s\n"
+      "fmax v16.4s, v16.4s, v25.4s\n"
+      "fmax v17.4s, v17.4s, v25.4s\n"
+      "fmax v18.4s, v18.4s, v25.4s\n"
+      "fmax v19.4s, v19.4s, v25.4s\n"
       "142:"  // Height 4: No activation
       "cmp x14, #0x10\n"
       "bge 151f\n"
@@ -2400,15 +2400,15 @@
       "169:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 170f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 171f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2419,10 +2419,10 @@
       "b 171f\n"
       "170:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "171:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "blt 174f\n"
@@ -2435,170 +2435,170 @@
       "ldr q7, [x12, #0x0]\n"
       "blt 173f\n"
       "172:"  // Height 5: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ecc8  // bfmmla v8.4s, v6.8h, v7.8h\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
       "sub x27, x27, #0x8\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x12, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x12, #0x10]\n"
       ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
       "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccc  // bfmmla v12.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec54  // bfmmla v20.4s, v2.8h, v0.8h\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9c  // bfmmla v28.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x11, #0x10]\n"
+      ".inst 0x6e47ecc9  // bfmmla v9.4s, v6.8h, v7.8h\n"
       "add x26, x26, #0x10\n"
       ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x0]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccd  // bfmmla v13.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec55  // bfmmla v21.4s, v2.8h, v0.8h\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e47ecca  // bfmmla v10.4s, v6.8h, v7.8h\n"
       "add x22, x22, #0x10\n"
       ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ecce  // bfmmla v14.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec56  // bfmmla v22.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e40ec9e  // bfmmla v30.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x6e47eccb  // bfmmla v11.4s, v6.8h, v7.8h\n"
       ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
       "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccf  // bfmmla v15.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec57  // bfmmla v23.4s, v2.8h, v0.8h\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x12, #0x30]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
+      "ldr q6, [x11, #0x20]\n"
       "add x12, x12, #0x40\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e40ec2c  // bfmmla v12.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbc  // bfmmla v28.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x11, #0x30]\n"
+      ".inst 0x6e46ec29  // bfmmla v9.4s, v1.8h, v6.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e46ec71  // bfmmla v17.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecb9  // bfmmla v25.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x10, #0x20]\n"
+      ".inst 0x6e40ec2d  // bfmmla v13.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbd  // bfmmla v29.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e46ec2a  // bfmmla v10.4s, v1.8h, v6.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e46ec72  // bfmmla v18.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecba  // bfmmla v26.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x9, #0x20]\n"
+      ".inst 0x6e40ec2e  // bfmmla v14.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbe  // bfmmla v30.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e46ec2b  // bfmmla v11.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecbb  // bfmmla v27.4s, v5.8h, v6.8h\n"
       "ldr q7, [x12, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e40ec2f  // bfmmla v15.4s, v1.8h, v0.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
+      ".inst 0x6e40ecbf  // bfmmla v31.4s, v5.8h, v0.8h\n"
       "ldr q5, [x22, #0x0]\n"
       "bge 172b\n"
       "173:"  // Height 5: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ecc8  // bfmmla v8.4s, v6.8h, v7.8h\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
       "sub x27, x27, #0x8\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x12, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x12, #0x10]\n"
       ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
       "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccc  // bfmmla v12.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec54  // bfmmla v20.4s, v2.8h, v0.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9c  // bfmmla v28.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x11, #0x10]\n"
+      ".inst 0x6e47ecc9  // bfmmla v9.4s, v6.8h, v7.8h\n"
       "add x25, x25, #0x10\n"
       ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x0]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccd  // bfmmla v13.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec55  // bfmmla v21.4s, v2.8h, v0.8h\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e47ecca  // bfmmla v10.4s, v6.8h, v7.8h\n"
       ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ecce  // bfmmla v14.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec56  // bfmmla v22.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e40ec9e  // bfmmla v30.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x6e47eccb  // bfmmla v11.4s, v6.8h, v7.8h\n"
       ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
       "ldr q7, [x12, #0x20]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
+      ".inst 0x6e40eccf  // bfmmla v15.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec57  // bfmmla v23.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
+      "ldr q2, [x12, #0x30]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       "add x12, x12, #0x40\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      "ldr q0, [x11, #0x20]\n"
+      ".inst 0x6e42ec2c  // bfmmla v12.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec74  // bfmmla v20.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbc  // bfmmla v28.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x11, #0x30]\n"
+      ".inst 0x6e40ec29  // bfmmla v9.4s, v1.8h, v0.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x6e42ec2d  // bfmmla v13.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec75  // bfmmla v21.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbd  // bfmmla v29.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x10, #0x30]\n"
+      ".inst 0x6e40ec2a  // bfmmla v10.4s, v1.8h, v0.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecba  // bfmmla v26.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x9, #0x20]\n"
+      ".inst 0x6e42ec2e  // bfmmla v14.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec76  // bfmmla v22.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbe  // bfmmla v30.4s, v5.8h, v2.8h\n"
       "ldr q6, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e40ec2b  // bfmmla v11.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbb  // bfmmla v27.4s, v5.8h, v0.8h\n"
       ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
       ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
       ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -2608,51 +2608,51 @@
       "blt 176f\n"
       "175:"  // Height 5: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr q6, [x12, #0x0]\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q7, [x12, #0x10]\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
+      "ldr d0, [x22], #0x8\n"
+      "ldr q1, [x12, #0x0]\n"
+      "trn1 v2.2d, v0.2d, v2.2d\n"
+      ".inst 0x6e41ec88  // bfmmla v8.4s, v4.8h, v1.8h\n"
+      "ldr q0, [x12, #0x10]\n"
+      ".inst 0x6e41ec70  // bfmmla v16.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec58  // bfmmla v24.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x11, #0x0]\n"
+      ".inst 0x6e40ec8c  // bfmmla v12.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
       "cmp x27, #0x4\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e40ec5c  // bfmmla v28.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x11, #0x10]\n"
+      ".inst 0x6e41ec89  // bfmmla v9.4s, v4.8h, v1.8h\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e41ec71  // bfmmla v17.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec59  // bfmmla v25.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x0]\n"
+      ".inst 0x6e40ec8d  // bfmmla v13.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5d  // bfmmla v29.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e41ec8a  // bfmmla v10.4s, v4.8h, v1.8h\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e41ec72  // bfmmla v18.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5a  // bfmmla v26.4s, v2.8h, v1.8h\n"
       "ldr q6, [x9, #0x0]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e40ec8e  // bfmmla v14.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5e  // bfmmla v30.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x6e46ec8b  // bfmmla v11.4s, v4.8h, v6.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5b  // bfmmla v27.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40ec8f  // bfmmla v15.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5f  // bfmmla v31.4s, v2.8h, v0.8h\n"
       "bge 175b\n"
       "176:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 179f\n"
@@ -2676,45 +2676,45 @@
       "ldr h4, [x23, #0x0]\n"
       "ldr h5, [x22, #0x0]\n"
       "178:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x12, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x12, #0x10]\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
+      "ldr q6, [x12, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      "trn1 v2.2d, v5.2d, v0.2d\n"
+      "ldr q1, [x12, #0x10]\n"
+      ".inst 0x6e46ece8  // bfmmla v8.4s, v7.8h, v6.8h\n"
+      ".inst 0x6e46ec70  // bfmmla v16.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec58  // bfmmla v24.4s, v2.8h, v6.8h\n"
+      "ldr q0, [x11, #0x0]\n"
+      ".inst 0x6e41ecec  // bfmmla v12.4s, v7.8h, v1.8h\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x6e41ec74  // bfmmla v20.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5c  // bfmmla v28.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x11, #0x10]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
+      ".inst 0x6e40ece9  // bfmmla v9.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec59  // bfmmla v25.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x0]\n"
+      ".inst 0x6e41eced  // bfmmla v13.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec75  // bfmmla v21.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x10]\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40ecea  // bfmmla v10.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5a  // bfmmla v26.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x9, #0x0]\n"
+      ".inst 0x6e41ecee  // bfmmla v14.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec76  // bfmmla v22.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5e  // bfmmla v30.4s, v2.8h, v1.8h\n"
       "ldr q6, [x9, #0x10]\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5b  // bfmmla v27.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e46ecef  // bfmmla v15.4s, v7.8h, v6.8h\n"
+      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5f  // bfmmla v31.4s, v2.8h, v6.8h\n"
       "179:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3213,16 +3213,16 @@
       "207:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 208f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 209f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -3234,11 +3234,11 @@
       "b 209f\n"
       "208:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "209:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "blt 212f\n"
@@ -3299,45 +3299,45 @@
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
       "ldr q2, [x25, #0x0]\n"
       ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
+      "ldr q0, [x12, #0x30]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
+      "ldr q6, [x11, #0x20]\n"
       "add x12, x12, #0x40\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e40ec2c  // bfmmla v12.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbc  // bfmmla v28.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x11, #0x30]\n"
+      ".inst 0x6e46ec29  // bfmmla v9.4s, v1.8h, v6.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e46ec71  // bfmmla v17.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecb9  // bfmmla v25.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x10, #0x20]\n"
+      ".inst 0x6e40ec2d  // bfmmla v13.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbd  // bfmmla v29.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e46ec2a  // bfmmla v10.4s, v1.8h, v6.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x9, #0x30]\n"
+      ".inst 0x6e46ec72  // bfmmla v18.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecba  // bfmmla v26.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x9, #0x20]\n"
+      ".inst 0x6e40ec2e  // bfmmla v14.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbe  // bfmmla v30.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e46ec2b  // bfmmla v11.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecbb  // bfmmla v27.4s, v5.8h, v6.8h\n"
       "ldr q7, [x12, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e40ec2f  // bfmmla v15.4s, v1.8h, v0.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
+      ".inst 0x6e40ecbf  // bfmmla v31.4s, v5.8h, v0.8h\n"
       "ldr q5, [x22, #0x0]\n"
       "ldr q6, [x21, #0x0]\n"
       "bge 210b\n"
@@ -3387,38 +3387,38 @@
       ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x12, #0x30]\n"
+      "ldr q2, [x12, #0x30]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       "add x12, x12, #0x40\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x11, #0x20]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x11, #0x30]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      "ldr q0, [x11, #0x20]\n"
+      ".inst 0x6e42ec2c  // bfmmla v12.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec74  // bfmmla v20.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbc  // bfmmla v28.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x11, #0x30]\n"
+      ".inst 0x6e40ec29  // bfmmla v9.4s, v1.8h, v0.8h\n"
       "add x11, x11, #0x40\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x6e42ec2d  // bfmmla v13.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec75  // bfmmla v21.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbd  // bfmmla v29.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x10, #0x30]\n"
+      ".inst 0x6e40ec2a  // bfmmla v10.4s, v1.8h, v0.8h\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecba  // bfmmla v26.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x9, #0x20]\n"
+      ".inst 0x6e42ec2e  // bfmmla v14.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec76  // bfmmla v22.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbe  // bfmmla v30.4s, v5.8h, v2.8h\n"
       "ldr q6, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e40ec2b  // bfmmla v11.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbb  // bfmmla v27.4s, v5.8h, v0.8h\n"
       ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
       ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
       ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -3428,52 +3428,52 @@
       "blt 214f\n"
       "213:"  // Height 6: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "cmp x27, #0x4\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x12, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
+      "ldr d1, [x22], #0x8\n"
+      "ldr d0, [x21], #0x8\n"
+      "trn1 v2.2d, v1.2d, v0.2d\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q0, [x12, #0x10]\n"
+      ".inst 0x6e41ec88  // bfmmla v8.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec70  // bfmmla v16.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec58  // bfmmla v24.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x11, #0x0]\n"
+      ".inst 0x6e40ec8c  // bfmmla v12.4s, v4.8h, v0.8h\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5c  // bfmmla v28.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x11, #0x10]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x10]\n"
+      ".inst 0x6e41ec89  // bfmmla v9.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec71  // bfmmla v17.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec59  // bfmmla v25.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x0]\n"
+      ".inst 0x6e40ec8d  // bfmmla v13.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5d  // bfmmla v29.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x10]\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e41ec8a  // bfmmla v10.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec72  // bfmmla v18.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5a  // bfmmla v26.4s, v2.8h, v1.8h\n"
       "ldr q6, [x9, #0x0]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
+      ".inst 0x6e40ec8e  // bfmmla v14.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5e  // bfmmla v30.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x9, #0x10]\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e46ec8b  // bfmmla v11.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5b  // bfmmla v27.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40ec8f  // bfmmla v15.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5f  // bfmmla v31.4s, v2.8h, v0.8h\n"
       "bge 213b\n"
       "214:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 217f\n"
@@ -3500,45 +3500,45 @@
       "ldr h5, [x22, #0x0]\n"
       "ldr h6, [x21, #0x0]\n"
       "216:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x12, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x12, #0x10]\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x11, #0x0]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      "ldr q0, [x12, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e40ece8  // bfmmla v8.4s, v7.8h, v0.8h\n"
+      "trn1 v2.2d, v5.2d, v6.2d\n"
+      "ldr q1, [x12, #0x10]\n"
+      ".inst 0x6e40ec70  // bfmmla v16.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec58  // bfmmla v24.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x11, #0x0]\n"
+      ".inst 0x6e41ecec  // bfmmla v12.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec74  // bfmmla v20.4s, v3.8h, v1.8h\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x11, #0x10]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e41ec5c  // bfmmla v28.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x11, #0x10]\n"
+      ".inst 0x6e40ece9  // bfmmla v9.4s, v7.8h, v0.8h\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec59  // bfmmla v25.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x0]\n"
+      ".inst 0x6e41eced  // bfmmla v13.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec75  // bfmmla v21.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x6e40ecea  // bfmmla v10.4s, v7.8h, v0.8h\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x9, #0x0]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5a  // bfmmla v26.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x9, #0x0]\n"
+      ".inst 0x6e41ecee  // bfmmla v14.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec76  // bfmmla v22.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5e  // bfmmla v30.4s, v2.8h, v1.8h\n"
       "ldr q6, [x9, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5b  // bfmmla v27.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e46ecef  // bfmmla v15.4s, v7.8h, v6.8h\n"
+      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5f  // bfmmla v31.4s, v2.8h, v6.8h\n"
       "217:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
index f7506e5..4924b3a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
index 18a2db5..8038612 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp
@@ -265,11 +265,11 @@
       "24:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 25f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 26f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -286,69 +286,69 @@
       "blt 28f\n"
       "27:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x12, #0x10]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x11, #0x10]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x12, #0x30]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x11, #0x30]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0x30]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x9, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x12, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x11, #0x40]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x9, #0x40]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x12, #0x50]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x11, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x10, #0x50]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x9, #0x50]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x12, #0x60]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x11, #0x60]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x9, #0x60]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x12, #0x70]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x11, #0x70]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr q17, [x10, #0x70]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr q16, [x9, #0x70]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "add x26, x26, #0x10\n"
       "ldr q0, [x26, #0x0]\n"
       "add x12, x12, #0x80\n"
@@ -360,68 +360,68 @@
       "bge 27b\n"
       "28:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x12, #0x10]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x11, #0x10]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x12, #0x30]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x11, #0x30]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0x30]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x9, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x12, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x11, #0x40]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x9, #0x40]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x12, #0x50]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x11, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x10, #0x50]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x9, #0x50]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x12, #0x60]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x11, #0x60]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x9, #0x60]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x12, #0x70]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x11, #0x70]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr q17, [x10, #0x70]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr q16, [x9, #0x70]\n"
       "sub x27, x27, #0x8\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "add x26, x26, #0x10\n"
       "add x12, x12, #0x80\n"
       "add x11, x11, #0x80\n"
@@ -431,15 +431,15 @@
       "cbz x27, 31f\n"
       "30:"  // Height 1: Multiply loop: Odd block loop
       "ldr h0, [x26], #0x2\n"
-      "ldr q6, [x12, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v8.8h, v16.8h, v0.h[0]\n"
       "sub x27, x27, #0x1\n"
-      "ldr q7, [x11, #0x0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "ldr q17, [x11, #0x0]\n"
+      "ldr q16, [x10, #0x0]\n"
+      "fmla v9.8h, v17.8h, v0.h[0]\n"
+      "fmla v10.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
       "add x12, x12, #0x10\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
@@ -452,17 +452,17 @@
       "bne 24b\n"
       "tbz %x[flags], #1, 32f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v17.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v17.8h\n"
+      "fmin v9.8h, v9.8h, v17.8h\n"
+      "fmin v10.8h, v10.8h, v17.8h\n"
+      "fmin v11.8h, v11.8h, v17.8h\n"
+      "fmax v8.8h, v8.8h, v16.8h\n"
+      "fmax v9.8h, v9.8h, v16.8h\n"
+      "fmax v10.8h, v10.8h, v16.8h\n"
+      "fmax v11.8h, v11.8h, v16.8h\n"
       "32:"  // Height 1: No activation
       "cmp x14, #0x20\n"
       "bge 49f\n"
@@ -778,12 +778,12 @@
       "74:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 75f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 76f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -791,7 +791,7 @@
       "b 76f\n"
       "75:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "76:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "blt 79f\n"
@@ -804,233 +804,233 @@
       "77:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "sub x27, x27, #0x8\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x0]\n"
       "cmp x27, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "fmla v14.8h, v17.8h, v1.h[0]\n"
+      "ldr q17, [x12, #0x10]\n"
       "add x26, x26, #0x10\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "fmla v15.8h, v16.8h, v1.h[0]\n"
+      "ldr q16, [x11, #0x10]\n"
       "add x25, x25, #0x10\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "fmla v12.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "fmla v13.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "fmla v14.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "fmla v15.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "fmla v12.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "fmla v13.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "fmla v14.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x12, #0x30]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "fmla v15.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x11, #0x30]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "fmla v12.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0x30]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "fmla v13.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x9, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "fmla v14.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x12, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "fmla v15.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x11, #0x40]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "fmla v12.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "fmla v13.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x9, #0x40]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "fmla v14.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x12, #0x50]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "fmla v15.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x11, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "fmla v12.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x10, #0x50]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "fmla v13.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x9, #0x50]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "fmla v14.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x12, #0x60]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "fmla v15.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x11, #0x60]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "fmla v12.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "fmla v13.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x9, #0x60]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "fmla v14.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x12, #0x70]\n"
       "add x12, x12, #0x80\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "fmla v15.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "fmla v12.8h, v17.8h, v1.h[7]\n"
+      "ldr q17, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "fmla v13.8h, v16.8h, v1.h[7]\n"
+      "ldr q16, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v14.8h, v17.8h, v1.h[7]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v16.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 77b\n"
       "78:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "sub x27, x27, #0x8\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x0]\n"
       "add x26, x26, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "fmla v14.8h, v17.8h, v1.h[0]\n"
+      "ldr q17, [x12, #0x10]\n"
       "add x25, x25, #0x10\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "fmla v15.8h, v16.8h, v1.h[0]\n"
+      "ldr q16, [x11, #0x10]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "fmla v12.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "fmla v13.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "fmla v14.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "fmla v15.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "fmla v12.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "fmla v13.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "fmla v14.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x12, #0x30]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "fmla v15.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x11, #0x30]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "fmla v12.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0x30]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "fmla v13.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x9, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "fmla v14.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x12, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "fmla v15.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x11, #0x40]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "fmla v12.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "fmla v13.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x9, #0x40]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "fmla v14.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x12, #0x50]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "fmla v15.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x11, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "fmla v12.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x10, #0x50]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "fmla v13.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x9, #0x50]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "fmla v14.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x12, #0x60]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "fmla v15.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x11, #0x60]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "fmla v12.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "fmla v13.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x9, #0x60]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "fmla v14.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x12, #0x70]\n"
       "add x12, x12, #0x80\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "fmla v15.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "fmla v12.8h, v17.8h, v1.h[7]\n"
+      "ldr q17, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "fmla v13.8h, v16.8h, v1.h[7]\n"
+      "ldr q16, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v14.8h, v17.8h, v1.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
+      "fmla v15.8h, v16.8h, v1.h[7]\n"
       "79:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 81f\n"
       "80:"  // Height 2: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h1, [x26], #0x2\n"
+      "ldr h0, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "ldr q17, [x12, #0x0]\n"
+      "ldr q16, [x11, #0x0]\n"
+      "fmla v8.8h, v17.8h, v1.h[0]\n"
+      "fmla v12.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "fmla v9.8h, v16.8h, v1.h[0]\n"
+      "fmla v13.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v10.8h, v17.8h, v1.h[0]\n"
+      "fmla v14.8h, v17.8h, v0.h[0]\n"
       "add x12, x12, #0x10\n"
       "add x11, x11, #0x10\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v11.8h, v16.8h, v1.h[0]\n"
+      "fmla v15.8h, v16.8h, v0.h[0]\n"
       "add x10, x10, #0x10\n"
       "add x9, x9, #0x10\n"
       "cbnz x27, 80b\n"
@@ -1043,25 +1043,25 @@
       "add x25, x13, x20, LSL #1\n"
       "tbz %x[flags], #1, 82f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v17.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v17.8h\n"
+      "fmin v9.8h, v9.8h, v17.8h\n"
+      "fmin v10.8h, v10.8h, v17.8h\n"
+      "fmin v11.8h, v11.8h, v17.8h\n"
+      "fmin v12.8h, v12.8h, v17.8h\n"
+      "fmin v13.8h, v13.8h, v17.8h\n"
+      "fmin v14.8h, v14.8h, v17.8h\n"
+      "fmin v15.8h, v15.8h, v17.8h\n"
+      "fmax v8.8h, v8.8h, v16.8h\n"
+      "fmax v9.8h, v9.8h, v16.8h\n"
+      "fmax v10.8h, v10.8h, v16.8h\n"
+      "fmax v11.8h, v11.8h, v16.8h\n"
+      "fmax v12.8h, v12.8h, v16.8h\n"
+      "fmax v13.8h, v13.8h, v16.8h\n"
+      "fmax v14.8h, v14.8h, v16.8h\n"
+      "fmax v15.8h, v15.8h, v16.8h\n"
       "82:"  // Height 2: No activation
       "cmp x14, #0x20\n"
       "bge 99f\n"
@@ -1458,13 +1458,13 @@
       "124:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 125f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 126f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1473,8 +1473,8 @@
       "b 126f\n"
       "125:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "126:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "blt 129f\n"
@@ -1491,139 +1491,139 @@
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q21, [x10, #0x0]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x26, x26, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q20, [x9, #0x0]\n"
       "add x25, x25, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v21.8h, v0.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
       "add x24, x24, #0x10\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v18.8h, v21.8h, v2.h[0]\n"
+      "ldr q21, [x12, #0x10]\n"
+      "fmla v11.8h, v20.8h, v0.h[0]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v2.h[0]\n"
+      "ldr q20, [x11, #0x10]\n"
+      "fmla v8.8h, v21.8h, v0.h[1]\n"
+      "fmla v12.8h, v21.8h, v1.h[1]\n"
+      "fmla v16.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x10]\n"
+      "fmla v9.8h, v20.8h, v0.h[1]\n"
+      "fmla v13.8h, v20.8h, v1.h[1]\n"
+      "fmla v17.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x9, #0x10]\n"
+      "fmla v10.8h, v21.8h, v0.h[1]\n"
+      "fmla v14.8h, v21.8h, v1.h[1]\n"
+      "fmla v18.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x12, #0x20]\n"
+      "fmla v11.8h, v20.8h, v0.h[1]\n"
+      "fmla v15.8h, v20.8h, v1.h[1]\n"
+      "fmla v19.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x11, #0x20]\n"
+      "fmla v8.8h, v21.8h, v0.h[2]\n"
+      "fmla v12.8h, v21.8h, v1.h[2]\n"
+      "fmla v16.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "fmla v9.8h, v20.8h, v0.h[2]\n"
+      "fmla v13.8h, v20.8h, v1.h[2]\n"
+      "fmla v17.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x9, #0x20]\n"
+      "fmla v10.8h, v21.8h, v0.h[2]\n"
+      "fmla v14.8h, v21.8h, v1.h[2]\n"
+      "fmla v18.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x12, #0x30]\n"
+      "fmla v11.8h, v20.8h, v0.h[2]\n"
+      "fmla v15.8h, v20.8h, v1.h[2]\n"
+      "fmla v19.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x11, #0x30]\n"
+      "fmla v8.8h, v21.8h, v0.h[3]\n"
+      "fmla v12.8h, v21.8h, v1.h[3]\n"
+      "fmla v16.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0x30]\n"
+      "fmla v9.8h, v20.8h, v0.h[3]\n"
+      "fmla v13.8h, v20.8h, v1.h[3]\n"
+      "fmla v17.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x9, #0x30]\n"
+      "fmla v10.8h, v21.8h, v0.h[3]\n"
+      "fmla v14.8h, v21.8h, v1.h[3]\n"
+      "fmla v18.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x12, #0x40]\n"
+      "fmla v11.8h, v20.8h, v0.h[3]\n"
+      "fmla v15.8h, v20.8h, v1.h[3]\n"
+      "fmla v19.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x11, #0x40]\n"
+      "fmla v8.8h, v21.8h, v0.h[4]\n"
+      "fmla v12.8h, v21.8h, v1.h[4]\n"
+      "fmla v16.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x10, #0x40]\n"
+      "fmla v9.8h, v20.8h, v0.h[4]\n"
+      "fmla v13.8h, v20.8h, v1.h[4]\n"
+      "fmla v17.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x9, #0x40]\n"
+      "fmla v10.8h, v21.8h, v0.h[4]\n"
+      "fmla v14.8h, v21.8h, v1.h[4]\n"
+      "fmla v18.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x12, #0x50]\n"
+      "fmla v11.8h, v20.8h, v0.h[4]\n"
+      "fmla v15.8h, v20.8h, v1.h[4]\n"
+      "fmla v19.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x11, #0x50]\n"
+      "fmla v8.8h, v21.8h, v0.h[5]\n"
+      "fmla v12.8h, v21.8h, v1.h[5]\n"
+      "fmla v16.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x10, #0x50]\n"
+      "fmla v9.8h, v20.8h, v0.h[5]\n"
+      "fmla v13.8h, v20.8h, v1.h[5]\n"
+      "fmla v17.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x9, #0x50]\n"
+      "fmla v10.8h, v21.8h, v0.h[5]\n"
+      "fmla v14.8h, v21.8h, v1.h[5]\n"
+      "fmla v18.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x12, #0x60]\n"
+      "fmla v11.8h, v20.8h, v0.h[5]\n"
+      "fmla v15.8h, v20.8h, v1.h[5]\n"
+      "fmla v19.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x11, #0x60]\n"
+      "fmla v8.8h, v21.8h, v0.h[6]\n"
+      "fmla v12.8h, v21.8h, v1.h[6]\n"
+      "fmla v16.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x10, #0x60]\n"
+      "fmla v9.8h, v20.8h, v0.h[6]\n"
+      "fmla v13.8h, v20.8h, v1.h[6]\n"
+      "fmla v17.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x9, #0x60]\n"
+      "fmla v10.8h, v21.8h, v0.h[6]\n"
+      "fmla v14.8h, v21.8h, v1.h[6]\n"
+      "fmla v18.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x12, #0x70]\n"
+      "fmla v11.8h, v20.8h, v0.h[6]\n"
       "add x12, x12, #0x80\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v15.8h, v20.8h, v1.h[6]\n"
+      "fmla v19.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v8.8h, v21.8h, v0.h[7]\n"
+      "fmla v12.8h, v21.8h, v1.h[7]\n"
+      "fmla v16.8h, v21.8h, v2.h[7]\n"
+      "ldr q21, [x10, #0x70]\n"
+      "fmla v9.8h, v20.8h, v0.h[7]\n"
       "add x10, x10, #0x80\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v13.8h, v20.8h, v1.h[7]\n"
+      "fmla v17.8h, v20.8h, v2.h[7]\n"
+      "ldr q20, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v10.8h, v21.8h, v0.h[7]\n"
+      "fmla v14.8h, v21.8h, v1.h[7]\n"
+      "fmla v18.8h, v21.8h, v2.h[7]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v20.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v20.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v19.8h, v20.8h, v2.h[7]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 127b\n"
@@ -1633,162 +1633,162 @@
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q21, [x10, #0x0]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x25, x25, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q20, [x9, #0x0]\n"
       "add x24, x24, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v10.8h, v21.8h, v0.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
+      "fmla v18.8h, v21.8h, v2.h[0]\n"
+      "ldr q21, [x12, #0x10]\n"
+      "fmla v11.8h, v20.8h, v0.h[0]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v2.h[0]\n"
+      "ldr q20, [x11, #0x10]\n"
+      "fmla v8.8h, v21.8h, v0.h[1]\n"
+      "fmla v12.8h, v21.8h, v1.h[1]\n"
+      "fmla v16.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x10]\n"
+      "fmla v9.8h, v20.8h, v0.h[1]\n"
+      "fmla v13.8h, v20.8h, v1.h[1]\n"
+      "fmla v17.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x9, #0x10]\n"
+      "fmla v10.8h, v21.8h, v0.h[1]\n"
+      "fmla v14.8h, v21.8h, v1.h[1]\n"
+      "fmla v18.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x12, #0x20]\n"
+      "fmla v11.8h, v20.8h, v0.h[1]\n"
+      "fmla v15.8h, v20.8h, v1.h[1]\n"
+      "fmla v19.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x11, #0x20]\n"
+      "fmla v8.8h, v21.8h, v0.h[2]\n"
+      "fmla v12.8h, v21.8h, v1.h[2]\n"
+      "fmla v16.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "fmla v9.8h, v20.8h, v0.h[2]\n"
+      "fmla v13.8h, v20.8h, v1.h[2]\n"
+      "fmla v17.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x9, #0x20]\n"
+      "fmla v10.8h, v21.8h, v0.h[2]\n"
+      "fmla v14.8h, v21.8h, v1.h[2]\n"
+      "fmla v18.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x12, #0x30]\n"
+      "fmla v11.8h, v20.8h, v0.h[2]\n"
+      "fmla v15.8h, v20.8h, v1.h[2]\n"
+      "fmla v19.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x11, #0x30]\n"
+      "fmla v8.8h, v21.8h, v0.h[3]\n"
+      "fmla v12.8h, v21.8h, v1.h[3]\n"
+      "fmla v16.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0x30]\n"
+      "fmla v9.8h, v20.8h, v0.h[3]\n"
+      "fmla v13.8h, v20.8h, v1.h[3]\n"
+      "fmla v17.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x9, #0x30]\n"
+      "fmla v10.8h, v21.8h, v0.h[3]\n"
+      "fmla v14.8h, v21.8h, v1.h[3]\n"
+      "fmla v18.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x12, #0x40]\n"
+      "fmla v11.8h, v20.8h, v0.h[3]\n"
+      "fmla v15.8h, v20.8h, v1.h[3]\n"
+      "fmla v19.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x11, #0x40]\n"
+      "fmla v8.8h, v21.8h, v0.h[4]\n"
+      "fmla v12.8h, v21.8h, v1.h[4]\n"
+      "fmla v16.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x10, #0x40]\n"
+      "fmla v9.8h, v20.8h, v0.h[4]\n"
+      "fmla v13.8h, v20.8h, v1.h[4]\n"
+      "fmla v17.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x9, #0x40]\n"
+      "fmla v10.8h, v21.8h, v0.h[4]\n"
+      "fmla v14.8h, v21.8h, v1.h[4]\n"
+      "fmla v18.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x12, #0x50]\n"
+      "fmla v11.8h, v20.8h, v0.h[4]\n"
+      "fmla v15.8h, v20.8h, v1.h[4]\n"
+      "fmla v19.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x11, #0x50]\n"
+      "fmla v8.8h, v21.8h, v0.h[5]\n"
+      "fmla v12.8h, v21.8h, v1.h[5]\n"
+      "fmla v16.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x10, #0x50]\n"
+      "fmla v9.8h, v20.8h, v0.h[5]\n"
+      "fmla v13.8h, v20.8h, v1.h[5]\n"
+      "fmla v17.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x9, #0x50]\n"
+      "fmla v10.8h, v21.8h, v0.h[5]\n"
+      "fmla v14.8h, v21.8h, v1.h[5]\n"
+      "fmla v18.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x12, #0x60]\n"
+      "fmla v11.8h, v20.8h, v0.h[5]\n"
+      "fmla v15.8h, v20.8h, v1.h[5]\n"
+      "fmla v19.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x11, #0x60]\n"
+      "fmla v8.8h, v21.8h, v0.h[6]\n"
+      "fmla v12.8h, v21.8h, v1.h[6]\n"
+      "fmla v16.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x10, #0x60]\n"
+      "fmla v9.8h, v20.8h, v0.h[6]\n"
+      "fmla v13.8h, v20.8h, v1.h[6]\n"
+      "fmla v17.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x9, #0x60]\n"
+      "fmla v10.8h, v21.8h, v0.h[6]\n"
+      "fmla v14.8h, v21.8h, v1.h[6]\n"
+      "fmla v18.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x12, #0x70]\n"
+      "fmla v11.8h, v20.8h, v0.h[6]\n"
       "add x12, x12, #0x80\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v15.8h, v20.8h, v1.h[6]\n"
+      "fmla v19.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v8.8h, v21.8h, v0.h[7]\n"
+      "fmla v12.8h, v21.8h, v1.h[7]\n"
+      "fmla v16.8h, v21.8h, v2.h[7]\n"
+      "ldr q21, [x10, #0x70]\n"
+      "fmla v9.8h, v20.8h, v0.h[7]\n"
       "add x10, x10, #0x80\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v13.8h, v20.8h, v1.h[7]\n"
+      "fmla v17.8h, v20.8h, v2.h[7]\n"
+      "ldr q20, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v10.8h, v21.8h, v0.h[7]\n"
+      "fmla v14.8h, v21.8h, v1.h[7]\n"
+      "fmla v18.8h, v21.8h, v2.h[7]\n"
+      "fmla v11.8h, v20.8h, v0.h[7]\n"
+      "fmla v15.8h, v20.8h, v1.h[7]\n"
+      "fmla v19.8h, v20.8h, v2.h[7]\n"
       "129:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 131f\n"
       "130:"  // Height 3: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
       "ldr h1, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr h2, [x24], #0x2\n"
-      "ldr q6, [x12, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr h0, [x24], #0x2\n"
+      "ldr q21, [x12, #0x0]\n"
+      "fmla v8.8h, v21.8h, v2.h[0]\n"
+      "fmla v12.8h, v21.8h, v1.h[0]\n"
+      "ldr q20, [x11, #0x0]\n"
+      "fmla v16.8h, v21.8h, v0.h[0]\n"
+      "ldr q21, [x10, #0x0]\n"
+      "fmla v9.8h, v20.8h, v2.h[0]\n"
+      "fmla v13.8h, v20.8h, v1.h[0]\n"
+      "fmla v17.8h, v20.8h, v0.h[0]\n"
+      "ldr q20, [x9, #0x0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v21.8h, v2.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v18.8h, v21.8h, v0.h[0]\n"
+      "fmla v11.8h, v20.8h, v2.h[0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v0.h[0]\n"
       "cbnz x27, 130b\n"
       "131:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1800,33 +1800,33 @@
       "add x24, x25, x20, LSL #1\n"
       "tbz %x[flags], #1, 132f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v21.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmin v16.8h, v16.8h, v1.8h\n"
-      "fmin v17.8h, v17.8h, v1.8h\n"
-      "fmin v18.8h, v18.8h, v1.8h\n"
-      "fmin v19.8h, v19.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
+      "ld1r { v20.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v21.8h\n"
+      "fmin v9.8h, v9.8h, v21.8h\n"
+      "fmin v10.8h, v10.8h, v21.8h\n"
+      "fmin v11.8h, v11.8h, v21.8h\n"
+      "fmin v12.8h, v12.8h, v21.8h\n"
+      "fmin v13.8h, v13.8h, v21.8h\n"
+      "fmin v14.8h, v14.8h, v21.8h\n"
+      "fmin v15.8h, v15.8h, v21.8h\n"
+      "fmin v16.8h, v16.8h, v21.8h\n"
+      "fmin v17.8h, v17.8h, v21.8h\n"
+      "fmin v18.8h, v18.8h, v21.8h\n"
+      "fmin v19.8h, v19.8h, v21.8h\n"
+      "fmax v8.8h, v8.8h, v20.8h\n"
+      "fmax v9.8h, v9.8h, v20.8h\n"
+      "fmax v10.8h, v10.8h, v20.8h\n"
+      "fmax v11.8h, v11.8h, v20.8h\n"
+      "fmax v12.8h, v12.8h, v20.8h\n"
+      "fmax v13.8h, v13.8h, v20.8h\n"
+      "fmax v14.8h, v14.8h, v20.8h\n"
+      "fmax v15.8h, v15.8h, v20.8h\n"
+      "fmax v16.8h, v16.8h, v20.8h\n"
+      "fmax v17.8h, v17.8h, v20.8h\n"
+      "fmax v18.8h, v18.8h, v20.8h\n"
+      "fmax v19.8h, v19.8h, v20.8h\n"
       "132:"  // Height 3: No activation
       "cmp x14, #0x20\n"
       "bge 149f\n"
@@ -2304,14 +2304,14 @@
       "174:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 175f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 176f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2321,9 +2321,9 @@
       "b 176f\n"
       "175:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "176:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "blt 179f\n"
@@ -2342,7 +2342,7 @@
       "cmp x27, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x0]\n"
       "add x26, x26, #0x10\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2350,164 +2350,164 @@
       "add x24, x24, #0x10\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q24, [x9, #0x0]\n"
       "add x23, x23, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
+      "fmla v10.8h, v25.8h, v0.h[0]\n"
+      "fmla v14.8h, v25.8h, v1.h[0]\n"
+      "fmla v18.8h, v25.8h, v2.h[0]\n"
+      "fmla v22.8h, v25.8h, v3.h[0]\n"
+      "ldr q25, [x12, #0x10]\n"
+      "fmla v11.8h, v24.8h, v0.h[0]\n"
+      "fmla v15.8h, v24.8h, v1.h[0]\n"
+      "fmla v19.8h, v24.8h, v2.h[0]\n"
+      "fmla v23.8h, v24.8h, v3.h[0]\n"
+      "ldr q24, [x11, #0x10]\n"
+      "fmla v8.8h, v25.8h, v0.h[1]\n"
+      "fmla v12.8h, v25.8h, v1.h[1]\n"
+      "fmla v16.8h, v25.8h, v2.h[1]\n"
+      "fmla v20.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "fmla v9.8h, v24.8h, v0.h[1]\n"
+      "fmla v13.8h, v24.8h, v1.h[1]\n"
+      "fmla v17.8h, v24.8h, v2.h[1]\n"
+      "fmla v21.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x9, #0x10]\n"
+      "fmla v10.8h, v25.8h, v0.h[1]\n"
+      "fmla v14.8h, v25.8h, v1.h[1]\n"
+      "fmla v18.8h, v25.8h, v2.h[1]\n"
+      "fmla v22.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x12, #0x20]\n"
+      "fmla v11.8h, v24.8h, v0.h[1]\n"
+      "fmla v15.8h, v24.8h, v1.h[1]\n"
+      "fmla v19.8h, v24.8h, v2.h[1]\n"
+      "fmla v23.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x11, #0x20]\n"
+      "fmla v8.8h, v25.8h, v0.h[2]\n"
+      "fmla v12.8h, v25.8h, v1.h[2]\n"
+      "fmla v16.8h, v25.8h, v2.h[2]\n"
+      "fmla v20.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0x20]\n"
+      "fmla v9.8h, v24.8h, v0.h[2]\n"
+      "fmla v13.8h, v24.8h, v1.h[2]\n"
+      "fmla v17.8h, v24.8h, v2.h[2]\n"
+      "fmla v21.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x9, #0x20]\n"
+      "fmla v10.8h, v25.8h, v0.h[2]\n"
+      "fmla v14.8h, v25.8h, v1.h[2]\n"
+      "fmla v18.8h, v25.8h, v2.h[2]\n"
+      "fmla v22.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x12, #0x30]\n"
+      "fmla v11.8h, v24.8h, v0.h[2]\n"
+      "fmla v15.8h, v24.8h, v1.h[2]\n"
+      "fmla v19.8h, v24.8h, v2.h[2]\n"
+      "fmla v23.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x11, #0x30]\n"
+      "fmla v8.8h, v25.8h, v0.h[3]\n"
+      "fmla v12.8h, v25.8h, v1.h[3]\n"
+      "fmla v16.8h, v25.8h, v2.h[3]\n"
+      "fmla v20.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0x30]\n"
+      "fmla v9.8h, v24.8h, v0.h[3]\n"
+      "fmla v13.8h, v24.8h, v1.h[3]\n"
+      "fmla v17.8h, v24.8h, v2.h[3]\n"
+      "fmla v21.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x9, #0x30]\n"
+      "fmla v10.8h, v25.8h, v0.h[3]\n"
+      "fmla v14.8h, v25.8h, v1.h[3]\n"
+      "fmla v18.8h, v25.8h, v2.h[3]\n"
+      "fmla v22.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x12, #0x40]\n"
+      "fmla v11.8h, v24.8h, v0.h[3]\n"
+      "fmla v15.8h, v24.8h, v1.h[3]\n"
+      "fmla v19.8h, v24.8h, v2.h[3]\n"
+      "fmla v23.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x11, #0x40]\n"
+      "fmla v8.8h, v25.8h, v0.h[4]\n"
+      "fmla v12.8h, v25.8h, v1.h[4]\n"
+      "fmla v16.8h, v25.8h, v2.h[4]\n"
+      "fmla v20.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x10, #0x40]\n"
+      "fmla v9.8h, v24.8h, v0.h[4]\n"
+      "fmla v13.8h, v24.8h, v1.h[4]\n"
+      "fmla v17.8h, v24.8h, v2.h[4]\n"
+      "fmla v21.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x9, #0x40]\n"
+      "fmla v10.8h, v25.8h, v0.h[4]\n"
+      "fmla v14.8h, v25.8h, v1.h[4]\n"
+      "fmla v18.8h, v25.8h, v2.h[4]\n"
+      "fmla v22.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x12, #0x50]\n"
+      "fmla v11.8h, v24.8h, v0.h[4]\n"
+      "fmla v15.8h, v24.8h, v1.h[4]\n"
+      "fmla v19.8h, v24.8h, v2.h[4]\n"
+      "fmla v23.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x11, #0x50]\n"
+      "fmla v8.8h, v25.8h, v0.h[5]\n"
+      "fmla v12.8h, v25.8h, v1.h[5]\n"
+      "fmla v16.8h, v25.8h, v2.h[5]\n"
+      "fmla v20.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x10, #0x50]\n"
+      "fmla v9.8h, v24.8h, v0.h[5]\n"
+      "fmla v13.8h, v24.8h, v1.h[5]\n"
+      "fmla v17.8h, v24.8h, v2.h[5]\n"
+      "fmla v21.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x9, #0x50]\n"
+      "fmla v10.8h, v25.8h, v0.h[5]\n"
+      "fmla v14.8h, v25.8h, v1.h[5]\n"
+      "fmla v18.8h, v25.8h, v2.h[5]\n"
+      "fmla v22.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x12, #0x60]\n"
+      "fmla v11.8h, v24.8h, v0.h[5]\n"
+      "fmla v15.8h, v24.8h, v1.h[5]\n"
+      "fmla v19.8h, v24.8h, v2.h[5]\n"
+      "fmla v23.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x11, #0x60]\n"
+      "fmla v8.8h, v25.8h, v0.h[6]\n"
+      "fmla v12.8h, v25.8h, v1.h[6]\n"
+      "fmla v16.8h, v25.8h, v2.h[6]\n"
+      "fmla v20.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x10, #0x60]\n"
+      "fmla v9.8h, v24.8h, v0.h[6]\n"
+      "fmla v13.8h, v24.8h, v1.h[6]\n"
+      "fmla v17.8h, v24.8h, v2.h[6]\n"
+      "fmla v21.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x9, #0x60]\n"
+      "fmla v10.8h, v25.8h, v0.h[6]\n"
+      "fmla v14.8h, v25.8h, v1.h[6]\n"
+      "fmla v18.8h, v25.8h, v2.h[6]\n"
+      "fmla v22.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x12, #0x70]\n"
       "add x12, x12, #0x80\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v11.8h, v24.8h, v0.h[6]\n"
+      "fmla v15.8h, v24.8h, v1.h[6]\n"
+      "fmla v19.8h, v24.8h, v2.h[6]\n"
+      "fmla v23.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
+      "fmla v8.8h, v25.8h, v0.h[7]\n"
+      "fmla v12.8h, v25.8h, v1.h[7]\n"
+      "fmla v16.8h, v25.8h, v2.h[7]\n"
+      "fmla v20.8h, v25.8h, v3.h[7]\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v9.8h, v24.8h, v0.h[7]\n"
+      "fmla v13.8h, v24.8h, v1.h[7]\n"
+      "fmla v17.8h, v24.8h, v2.h[7]\n"
+      "fmla v21.8h, v24.8h, v3.h[7]\n"
+      "ldr q24, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v10.8h, v25.8h, v0.h[7]\n"
+      "fmla v14.8h, v25.8h, v1.h[7]\n"
+      "fmla v18.8h, v25.8h, v2.h[7]\n"
+      "fmla v22.8h, v25.8h, v3.h[7]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v24.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v24.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v19.8h, v24.8h, v2.h[7]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v23.8h, v24.8h, v3.h[7]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 177b\n"
@@ -2518,7 +2518,7 @@
       "add x26, x26, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x0]\n"
       "add x25, x25, #0x10\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2526,191 +2526,191 @@
       "add x23, x23, #0x10\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
+      "ldr q24, [x9, #0x0]\n"
+      "fmla v10.8h, v25.8h, v0.h[0]\n"
+      "fmla v14.8h, v25.8h, v1.h[0]\n"
+      "fmla v18.8h, v25.8h, v2.h[0]\n"
+      "fmla v22.8h, v25.8h, v3.h[0]\n"
+      "ldr q25, [x12, #0x10]\n"
+      "fmla v11.8h, v24.8h, v0.h[0]\n"
+      "fmla v15.8h, v24.8h, v1.h[0]\n"
+      "fmla v19.8h, v24.8h, v2.h[0]\n"
+      "fmla v23.8h, v24.8h, v3.h[0]\n"
+      "ldr q24, [x11, #0x10]\n"
+      "fmla v8.8h, v25.8h, v0.h[1]\n"
+      "fmla v12.8h, v25.8h, v1.h[1]\n"
+      "fmla v16.8h, v25.8h, v2.h[1]\n"
+      "fmla v20.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "fmla v9.8h, v24.8h, v0.h[1]\n"
+      "fmla v13.8h, v24.8h, v1.h[1]\n"
+      "fmla v17.8h, v24.8h, v2.h[1]\n"
+      "fmla v21.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x9, #0x10]\n"
+      "fmla v10.8h, v25.8h, v0.h[1]\n"
+      "fmla v14.8h, v25.8h, v1.h[1]\n"
+      "fmla v18.8h, v25.8h, v2.h[1]\n"
+      "fmla v22.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x12, #0x20]\n"
+      "fmla v11.8h, v24.8h, v0.h[1]\n"
+      "fmla v15.8h, v24.8h, v1.h[1]\n"
+      "fmla v19.8h, v24.8h, v2.h[1]\n"
+      "fmla v23.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x11, #0x20]\n"
+      "fmla v8.8h, v25.8h, v0.h[2]\n"
+      "fmla v12.8h, v25.8h, v1.h[2]\n"
+      "fmla v16.8h, v25.8h, v2.h[2]\n"
+      "fmla v20.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0x20]\n"
+      "fmla v9.8h, v24.8h, v0.h[2]\n"
+      "fmla v13.8h, v24.8h, v1.h[2]\n"
+      "fmla v17.8h, v24.8h, v2.h[2]\n"
+      "fmla v21.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x9, #0x20]\n"
+      "fmla v10.8h, v25.8h, v0.h[2]\n"
+      "fmla v14.8h, v25.8h, v1.h[2]\n"
+      "fmla v18.8h, v25.8h, v2.h[2]\n"
+      "fmla v22.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x12, #0x30]\n"
+      "fmla v11.8h, v24.8h, v0.h[2]\n"
+      "fmla v15.8h, v24.8h, v1.h[2]\n"
+      "fmla v19.8h, v24.8h, v2.h[2]\n"
+      "fmla v23.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x11, #0x30]\n"
+      "fmla v8.8h, v25.8h, v0.h[3]\n"
+      "fmla v12.8h, v25.8h, v1.h[3]\n"
+      "fmla v16.8h, v25.8h, v2.h[3]\n"
+      "fmla v20.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0x30]\n"
+      "fmla v9.8h, v24.8h, v0.h[3]\n"
+      "fmla v13.8h, v24.8h, v1.h[3]\n"
+      "fmla v17.8h, v24.8h, v2.h[3]\n"
+      "fmla v21.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x9, #0x30]\n"
+      "fmla v10.8h, v25.8h, v0.h[3]\n"
+      "fmla v14.8h, v25.8h, v1.h[3]\n"
+      "fmla v18.8h, v25.8h, v2.h[3]\n"
+      "fmla v22.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x12, #0x40]\n"
+      "fmla v11.8h, v24.8h, v0.h[3]\n"
+      "fmla v15.8h, v24.8h, v1.h[3]\n"
+      "fmla v19.8h, v24.8h, v2.h[3]\n"
+      "fmla v23.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x11, #0x40]\n"
+      "fmla v8.8h, v25.8h, v0.h[4]\n"
+      "fmla v12.8h, v25.8h, v1.h[4]\n"
+      "fmla v16.8h, v25.8h, v2.h[4]\n"
+      "fmla v20.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x10, #0x40]\n"
+      "fmla v9.8h, v24.8h, v0.h[4]\n"
+      "fmla v13.8h, v24.8h, v1.h[4]\n"
+      "fmla v17.8h, v24.8h, v2.h[4]\n"
+      "fmla v21.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x9, #0x40]\n"
+      "fmla v10.8h, v25.8h, v0.h[4]\n"
+      "fmla v14.8h, v25.8h, v1.h[4]\n"
+      "fmla v18.8h, v25.8h, v2.h[4]\n"
+      "fmla v22.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x12, #0x50]\n"
+      "fmla v11.8h, v24.8h, v0.h[4]\n"
+      "fmla v15.8h, v24.8h, v1.h[4]\n"
+      "fmla v19.8h, v24.8h, v2.h[4]\n"
+      "fmla v23.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x11, #0x50]\n"
+      "fmla v8.8h, v25.8h, v0.h[5]\n"
+      "fmla v12.8h, v25.8h, v1.h[5]\n"
+      "fmla v16.8h, v25.8h, v2.h[5]\n"
+      "fmla v20.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x10, #0x50]\n"
+      "fmla v9.8h, v24.8h, v0.h[5]\n"
+      "fmla v13.8h, v24.8h, v1.h[5]\n"
+      "fmla v17.8h, v24.8h, v2.h[5]\n"
+      "fmla v21.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x9, #0x50]\n"
+      "fmla v10.8h, v25.8h, v0.h[5]\n"
+      "fmla v14.8h, v25.8h, v1.h[5]\n"
+      "fmla v18.8h, v25.8h, v2.h[5]\n"
+      "fmla v22.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x12, #0x60]\n"
+      "fmla v11.8h, v24.8h, v0.h[5]\n"
+      "fmla v15.8h, v24.8h, v1.h[5]\n"
+      "fmla v19.8h, v24.8h, v2.h[5]\n"
+      "fmla v23.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x11, #0x60]\n"
+      "fmla v8.8h, v25.8h, v0.h[6]\n"
+      "fmla v12.8h, v25.8h, v1.h[6]\n"
+      "fmla v16.8h, v25.8h, v2.h[6]\n"
+      "fmla v20.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x10, #0x60]\n"
+      "fmla v9.8h, v24.8h, v0.h[6]\n"
+      "fmla v13.8h, v24.8h, v1.h[6]\n"
+      "fmla v17.8h, v24.8h, v2.h[6]\n"
+      "fmla v21.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x9, #0x60]\n"
+      "fmla v10.8h, v25.8h, v0.h[6]\n"
+      "fmla v14.8h, v25.8h, v1.h[6]\n"
+      "fmla v18.8h, v25.8h, v2.h[6]\n"
+      "fmla v22.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x12, #0x70]\n"
       "add x12, x12, #0x80\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v11.8h, v24.8h, v0.h[6]\n"
+      "fmla v15.8h, v24.8h, v1.h[6]\n"
+      "fmla v19.8h, v24.8h, v2.h[6]\n"
+      "fmla v23.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
+      "fmla v8.8h, v25.8h, v0.h[7]\n"
+      "fmla v12.8h, v25.8h, v1.h[7]\n"
+      "fmla v16.8h, v25.8h, v2.h[7]\n"
+      "fmla v20.8h, v25.8h, v3.h[7]\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v9.8h, v24.8h, v0.h[7]\n"
+      "fmla v13.8h, v24.8h, v1.h[7]\n"
+      "fmla v17.8h, v24.8h, v2.h[7]\n"
+      "fmla v21.8h, v24.8h, v3.h[7]\n"
+      "ldr q24, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v10.8h, v25.8h, v0.h[7]\n"
+      "fmla v14.8h, v25.8h, v1.h[7]\n"
+      "fmla v18.8h, v25.8h, v2.h[7]\n"
+      "fmla v22.8h, v25.8h, v3.h[7]\n"
+      "fmla v11.8h, v24.8h, v0.h[7]\n"
+      "fmla v15.8h, v24.8h, v1.h[7]\n"
+      "fmla v19.8h, v24.8h, v2.h[7]\n"
+      "fmla v23.8h, v24.8h, v3.h[7]\n"
       "179:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 181f\n"
       "180:"  // Height 4: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h3, [x26], #0x2\n"
+      "ldr h2, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr h2, [x24], #0x2\n"
-      "ldr h3, [x23], #0x2\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr h1, [x24], #0x2\n"
+      "ldr h0, [x23], #0x2\n"
+      "ldr q25, [x12, #0x0]\n"
+      "ldr q24, [x11, #0x0]\n"
+      "fmla v8.8h, v25.8h, v3.h[0]\n"
+      "fmla v12.8h, v25.8h, v2.h[0]\n"
+      "fmla v16.8h, v25.8h, v1.h[0]\n"
+      "fmla v20.8h, v25.8h, v0.h[0]\n"
+      "ldr q25, [x10, #0x0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "fmla v9.8h, v24.8h, v3.h[0]\n"
+      "fmla v13.8h, v24.8h, v2.h[0]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "fmla v17.8h, v24.8h, v1.h[0]\n"
+      "fmla v21.8h, v24.8h, v0.h[0]\n"
+      "ldr q24, [x9, #0x0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v10.8h, v25.8h, v3.h[0]\n"
+      "fmla v14.8h, v25.8h, v2.h[0]\n"
+      "fmla v18.8h, v25.8h, v1.h[0]\n"
+      "fmla v22.8h, v25.8h, v0.h[0]\n"
+      "fmla v11.8h, v24.8h, v3.h[0]\n"
+      "fmla v15.8h, v24.8h, v2.h[0]\n"
+      "fmla v19.8h, v24.8h, v1.h[0]\n"
+      "fmla v23.8h, v24.8h, v0.h[0]\n"
       "cbnz x27, 180b\n"
       "181:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2723,41 +2723,41 @@
       "add x23, x24, x20, LSL #1\n"
       "tbz %x[flags], #1, 182f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v25.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmin v16.8h, v16.8h, v1.8h\n"
-      "fmin v17.8h, v17.8h, v1.8h\n"
-      "fmin v18.8h, v18.8h, v1.8h\n"
-      "fmin v19.8h, v19.8h, v1.8h\n"
-      "fmin v20.8h, v20.8h, v1.8h\n"
-      "fmin v21.8h, v21.8h, v1.8h\n"
-      "fmin v22.8h, v22.8h, v1.8h\n"
-      "fmin v23.8h, v23.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
-      "fmax v20.8h, v20.8h, v0.8h\n"
-      "fmax v21.8h, v21.8h, v0.8h\n"
-      "fmax v22.8h, v22.8h, v0.8h\n"
-      "fmax v23.8h, v23.8h, v0.8h\n"
+      "ld1r { v24.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v25.8h\n"
+      "fmin v9.8h, v9.8h, v25.8h\n"
+      "fmin v10.8h, v10.8h, v25.8h\n"
+      "fmin v11.8h, v11.8h, v25.8h\n"
+      "fmin v12.8h, v12.8h, v25.8h\n"
+      "fmin v13.8h, v13.8h, v25.8h\n"
+      "fmin v14.8h, v14.8h, v25.8h\n"
+      "fmin v15.8h, v15.8h, v25.8h\n"
+      "fmin v16.8h, v16.8h, v25.8h\n"
+      "fmin v17.8h, v17.8h, v25.8h\n"
+      "fmin v18.8h, v18.8h, v25.8h\n"
+      "fmin v19.8h, v19.8h, v25.8h\n"
+      "fmin v20.8h, v20.8h, v25.8h\n"
+      "fmin v21.8h, v21.8h, v25.8h\n"
+      "fmin v22.8h, v22.8h, v25.8h\n"
+      "fmin v23.8h, v23.8h, v25.8h\n"
+      "fmax v8.8h, v8.8h, v24.8h\n"
+      "fmax v9.8h, v9.8h, v24.8h\n"
+      "fmax v10.8h, v10.8h, v24.8h\n"
+      "fmax v11.8h, v11.8h, v24.8h\n"
+      "fmax v12.8h, v12.8h, v24.8h\n"
+      "fmax v13.8h, v13.8h, v24.8h\n"
+      "fmax v14.8h, v14.8h, v24.8h\n"
+      "fmax v15.8h, v15.8h, v24.8h\n"
+      "fmax v16.8h, v16.8h, v24.8h\n"
+      "fmax v17.8h, v17.8h, v24.8h\n"
+      "fmax v18.8h, v18.8h, v24.8h\n"
+      "fmax v19.8h, v19.8h, v24.8h\n"
+      "fmax v20.8h, v20.8h, v24.8h\n"
+      "fmax v21.8h, v21.8h, v24.8h\n"
+      "fmax v22.8h, v22.8h, v24.8h\n"
+      "fmax v23.8h, v23.8h, v24.8h\n"
       "182:"  // Height 4: No activation
       "cmp x14, #0x20\n"
       "bge 199f\n"
@@ -3316,15 +3316,15 @@
       "224:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 225f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 226f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -3335,10 +3335,10 @@
       "b 226f\n"
       "225:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "226:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "blt 229f\n"
@@ -3361,7 +3361,7 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x0]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x24, x24, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3370,194 +3370,194 @@
       "add x22, x22, #0x10\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "fmla v24.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "fmla v26.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "fmla v27.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "fmla v24.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "fmla v25.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "fmla v26.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "fmla v27.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "fmla v24.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "fmla v25.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "fmla v26.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "fmla v27.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "fmla v24.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "fmla v25.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "fmla v26.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "fmla v27.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "fmla v24.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "fmla v25.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "fmla v26.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "fmla v27.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "fmla v24.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "fmla v25.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "fmla v26.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "ldr q28, [x9, #0x0]\n"
+      "fmla v10.8h, v29.8h, v0.h[0]\n"
+      "fmla v14.8h, v29.8h, v1.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v3.h[0]\n"
+      "fmla v26.8h, v29.8h, v4.h[0]\n"
+      "ldr q29, [x12, #0x10]\n"
+      "fmla v11.8h, v28.8h, v0.h[0]\n"
+      "fmla v15.8h, v28.8h, v1.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v3.h[0]\n"
+      "fmla v27.8h, v28.8h, v4.h[0]\n"
+      "ldr q28, [x11, #0x10]\n"
+      "fmla v8.8h, v29.8h, v0.h[1]\n"
+      "fmla v12.8h, v29.8h, v1.h[1]\n"
+      "fmla v16.8h, v29.8h, v2.h[1]\n"
+      "fmla v20.8h, v29.8h, v3.h[1]\n"
+      "fmla v24.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "fmla v9.8h, v28.8h, v0.h[1]\n"
+      "fmla v13.8h, v28.8h, v1.h[1]\n"
+      "fmla v17.8h, v28.8h, v2.h[1]\n"
+      "fmla v21.8h, v28.8h, v3.h[1]\n"
+      "fmla v25.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x9, #0x10]\n"
+      "fmla v10.8h, v29.8h, v0.h[1]\n"
+      "fmla v14.8h, v29.8h, v1.h[1]\n"
+      "fmla v18.8h, v29.8h, v2.h[1]\n"
+      "fmla v22.8h, v29.8h, v3.h[1]\n"
+      "fmla v26.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x12, #0x20]\n"
+      "fmla v11.8h, v28.8h, v0.h[1]\n"
+      "fmla v15.8h, v28.8h, v1.h[1]\n"
+      "fmla v19.8h, v28.8h, v2.h[1]\n"
+      "fmla v23.8h, v28.8h, v3.h[1]\n"
+      "fmla v27.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x11, #0x20]\n"
+      "fmla v8.8h, v29.8h, v0.h[2]\n"
+      "fmla v12.8h, v29.8h, v1.h[2]\n"
+      "fmla v16.8h, v29.8h, v2.h[2]\n"
+      "fmla v20.8h, v29.8h, v3.h[2]\n"
+      "fmla v24.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "fmla v9.8h, v28.8h, v0.h[2]\n"
+      "fmla v13.8h, v28.8h, v1.h[2]\n"
+      "fmla v17.8h, v28.8h, v2.h[2]\n"
+      "fmla v21.8h, v28.8h, v3.h[2]\n"
+      "fmla v25.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x9, #0x20]\n"
+      "fmla v10.8h, v29.8h, v0.h[2]\n"
+      "fmla v14.8h, v29.8h, v1.h[2]\n"
+      "fmla v18.8h, v29.8h, v2.h[2]\n"
+      "fmla v22.8h, v29.8h, v3.h[2]\n"
+      "fmla v26.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x12, #0x30]\n"
+      "fmla v11.8h, v28.8h, v0.h[2]\n"
+      "fmla v15.8h, v28.8h, v1.h[2]\n"
+      "fmla v19.8h, v28.8h, v2.h[2]\n"
+      "fmla v23.8h, v28.8h, v3.h[2]\n"
+      "fmla v27.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x11, #0x30]\n"
+      "fmla v8.8h, v29.8h, v0.h[3]\n"
+      "fmla v12.8h, v29.8h, v1.h[3]\n"
+      "fmla v16.8h, v29.8h, v2.h[3]\n"
+      "fmla v20.8h, v29.8h, v3.h[3]\n"
+      "fmla v24.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0x30]\n"
+      "fmla v9.8h, v28.8h, v0.h[3]\n"
+      "fmla v13.8h, v28.8h, v1.h[3]\n"
+      "fmla v17.8h, v28.8h, v2.h[3]\n"
+      "fmla v21.8h, v28.8h, v3.h[3]\n"
+      "fmla v25.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x9, #0x30]\n"
+      "fmla v10.8h, v29.8h, v0.h[3]\n"
+      "fmla v14.8h, v29.8h, v1.h[3]\n"
+      "fmla v18.8h, v29.8h, v2.h[3]\n"
+      "fmla v22.8h, v29.8h, v3.h[3]\n"
+      "fmla v26.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x12, #0x40]\n"
+      "fmla v11.8h, v28.8h, v0.h[3]\n"
+      "fmla v15.8h, v28.8h, v1.h[3]\n"
+      "fmla v19.8h, v28.8h, v2.h[3]\n"
+      "fmla v23.8h, v28.8h, v3.h[3]\n"
+      "fmla v27.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x11, #0x40]\n"
+      "fmla v8.8h, v29.8h, v0.h[4]\n"
+      "fmla v12.8h, v29.8h, v1.h[4]\n"
+      "fmla v16.8h, v29.8h, v2.h[4]\n"
+      "fmla v20.8h, v29.8h, v3.h[4]\n"
+      "fmla v24.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x10, #0x40]\n"
+      "fmla v9.8h, v28.8h, v0.h[4]\n"
+      "fmla v13.8h, v28.8h, v1.h[4]\n"
+      "fmla v17.8h, v28.8h, v2.h[4]\n"
+      "fmla v21.8h, v28.8h, v3.h[4]\n"
+      "fmla v25.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x9, #0x40]\n"
+      "fmla v10.8h, v29.8h, v0.h[4]\n"
+      "fmla v14.8h, v29.8h, v1.h[4]\n"
+      "fmla v18.8h, v29.8h, v2.h[4]\n"
+      "fmla v22.8h, v29.8h, v3.h[4]\n"
+      "fmla v26.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x12, #0x50]\n"
+      "fmla v11.8h, v28.8h, v0.h[4]\n"
+      "fmla v15.8h, v28.8h, v1.h[4]\n"
+      "fmla v19.8h, v28.8h, v2.h[4]\n"
+      "fmla v23.8h, v28.8h, v3.h[4]\n"
+      "fmla v27.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x11, #0x50]\n"
+      "fmla v8.8h, v29.8h, v0.h[5]\n"
+      "fmla v12.8h, v29.8h, v1.h[5]\n"
+      "fmla v16.8h, v29.8h, v2.h[5]\n"
+      "fmla v20.8h, v29.8h, v3.h[5]\n"
+      "fmla v24.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x10, #0x50]\n"
+      "fmla v9.8h, v28.8h, v0.h[5]\n"
+      "fmla v13.8h, v28.8h, v1.h[5]\n"
+      "fmla v17.8h, v28.8h, v2.h[5]\n"
+      "fmla v21.8h, v28.8h, v3.h[5]\n"
+      "fmla v25.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x9, #0x50]\n"
+      "fmla v10.8h, v29.8h, v0.h[5]\n"
+      "fmla v14.8h, v29.8h, v1.h[5]\n"
+      "fmla v18.8h, v29.8h, v2.h[5]\n"
+      "fmla v22.8h, v29.8h, v3.h[5]\n"
+      "fmla v26.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x12, #0x60]\n"
+      "fmla v11.8h, v28.8h, v0.h[5]\n"
+      "fmla v15.8h, v28.8h, v1.h[5]\n"
+      "fmla v19.8h, v28.8h, v2.h[5]\n"
+      "fmla v23.8h, v28.8h, v3.h[5]\n"
+      "fmla v27.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x11, #0x60]\n"
+      "fmla v8.8h, v29.8h, v0.h[6]\n"
+      "fmla v12.8h, v29.8h, v1.h[6]\n"
+      "fmla v16.8h, v29.8h, v2.h[6]\n"
+      "fmla v20.8h, v29.8h, v3.h[6]\n"
+      "fmla v24.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x10, #0x60]\n"
+      "fmla v9.8h, v28.8h, v0.h[6]\n"
+      "fmla v13.8h, v28.8h, v1.h[6]\n"
+      "fmla v17.8h, v28.8h, v2.h[6]\n"
+      "fmla v21.8h, v28.8h, v3.h[6]\n"
+      "fmla v25.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x9, #0x60]\n"
+      "fmla v10.8h, v29.8h, v0.h[6]\n"
+      "fmla v14.8h, v29.8h, v1.h[6]\n"
+      "fmla v18.8h, v29.8h, v2.h[6]\n"
+      "fmla v22.8h, v29.8h, v3.h[6]\n"
+      "fmla v26.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x12, #0x70]\n"
+      "fmla v11.8h, v28.8h, v0.h[6]\n"
       "add x12, x12, #0x80\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "fmla v27.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v15.8h, v28.8h, v1.h[6]\n"
+      "fmla v19.8h, v28.8h, v2.h[6]\n"
+      "fmla v23.8h, v28.8h, v3.h[6]\n"
+      "fmla v27.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "fmla v24.8h, v6.8h, v4.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v8.8h, v29.8h, v0.h[7]\n"
+      "fmla v12.8h, v29.8h, v1.h[7]\n"
+      "fmla v16.8h, v29.8h, v2.h[7]\n"
+      "fmla v20.8h, v29.8h, v3.h[7]\n"
+      "fmla v24.8h, v29.8h, v4.h[7]\n"
+      "ldr q29, [x10, #0x70]\n"
+      "fmla v9.8h, v28.8h, v0.h[7]\n"
       "add x10, x10, #0x80\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "fmla v25.8h, v7.8h, v4.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v13.8h, v28.8h, v1.h[7]\n"
+      "fmla v17.8h, v28.8h, v2.h[7]\n"
+      "fmla v21.8h, v28.8h, v3.h[7]\n"
+      "fmla v25.8h, v28.8h, v4.h[7]\n"
+      "ldr q28, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v10.8h, v29.8h, v0.h[7]\n"
+      "fmla v14.8h, v29.8h, v1.h[7]\n"
+      "fmla v18.8h, v29.8h, v2.h[7]\n"
+      "fmla v22.8h, v29.8h, v3.h[7]\n"
+      "fmla v26.8h, v29.8h, v4.h[7]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v28.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v28.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v19.8h, v28.8h, v2.h[7]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v23.8h, v28.8h, v3.h[7]\n"
       "ldr q3, [x23, #0x0]\n"
-      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v27.8h, v28.8h, v4.h[7]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 227b\n"
@@ -3571,7 +3571,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x0]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x23, x23, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3579,226 +3579,226 @@
       "add x22, x22, #0x10\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "fmla v24.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "fmla v26.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "fmla v27.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "fmla v24.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "fmla v25.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "fmla v26.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "fmla v27.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "fmla v24.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "fmla v25.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x9, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "fmla v26.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x12, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "fmla v27.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x11, #0x40]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "fmla v24.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "fmla v25.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x9, #0x40]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "fmla v26.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x12, #0x50]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "fmla v27.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x11, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "fmla v24.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x10, #0x50]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "fmla v25.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x9, #0x50]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "fmla v26.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x12, #0x60]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "fmla v27.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x11, #0x60]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "fmla v24.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "fmla v25.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x9, #0x60]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "fmla v26.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x12, #0x70]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "ldr q28, [x9, #0x0]\n"
+      "fmla v10.8h, v29.8h, v0.h[0]\n"
+      "fmla v14.8h, v29.8h, v1.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v3.h[0]\n"
+      "fmla v26.8h, v29.8h, v4.h[0]\n"
+      "ldr q29, [x12, #0x10]\n"
+      "fmla v11.8h, v28.8h, v0.h[0]\n"
+      "fmla v15.8h, v28.8h, v1.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v3.h[0]\n"
+      "fmla v27.8h, v28.8h, v4.h[0]\n"
+      "ldr q28, [x11, #0x10]\n"
+      "fmla v8.8h, v29.8h, v0.h[1]\n"
+      "fmla v12.8h, v29.8h, v1.h[1]\n"
+      "fmla v16.8h, v29.8h, v2.h[1]\n"
+      "fmla v20.8h, v29.8h, v3.h[1]\n"
+      "fmla v24.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "fmla v9.8h, v28.8h, v0.h[1]\n"
+      "fmla v13.8h, v28.8h, v1.h[1]\n"
+      "fmla v17.8h, v28.8h, v2.h[1]\n"
+      "fmla v21.8h, v28.8h, v3.h[1]\n"
+      "fmla v25.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x9, #0x10]\n"
+      "fmla v10.8h, v29.8h, v0.h[1]\n"
+      "fmla v14.8h, v29.8h, v1.h[1]\n"
+      "fmla v18.8h, v29.8h, v2.h[1]\n"
+      "fmla v22.8h, v29.8h, v3.h[1]\n"
+      "fmla v26.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x12, #0x20]\n"
+      "fmla v11.8h, v28.8h, v0.h[1]\n"
+      "fmla v15.8h, v28.8h, v1.h[1]\n"
+      "fmla v19.8h, v28.8h, v2.h[1]\n"
+      "fmla v23.8h, v28.8h, v3.h[1]\n"
+      "fmla v27.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x11, #0x20]\n"
+      "fmla v8.8h, v29.8h, v0.h[2]\n"
+      "fmla v12.8h, v29.8h, v1.h[2]\n"
+      "fmla v16.8h, v29.8h, v2.h[2]\n"
+      "fmla v20.8h, v29.8h, v3.h[2]\n"
+      "fmla v24.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "fmla v9.8h, v28.8h, v0.h[2]\n"
+      "fmla v13.8h, v28.8h, v1.h[2]\n"
+      "fmla v17.8h, v28.8h, v2.h[2]\n"
+      "fmla v21.8h, v28.8h, v3.h[2]\n"
+      "fmla v25.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x9, #0x20]\n"
+      "fmla v10.8h, v29.8h, v0.h[2]\n"
+      "fmla v14.8h, v29.8h, v1.h[2]\n"
+      "fmla v18.8h, v29.8h, v2.h[2]\n"
+      "fmla v22.8h, v29.8h, v3.h[2]\n"
+      "fmla v26.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x12, #0x30]\n"
+      "fmla v11.8h, v28.8h, v0.h[2]\n"
+      "fmla v15.8h, v28.8h, v1.h[2]\n"
+      "fmla v19.8h, v28.8h, v2.h[2]\n"
+      "fmla v23.8h, v28.8h, v3.h[2]\n"
+      "fmla v27.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x11, #0x30]\n"
+      "fmla v8.8h, v29.8h, v0.h[3]\n"
+      "fmla v12.8h, v29.8h, v1.h[3]\n"
+      "fmla v16.8h, v29.8h, v2.h[3]\n"
+      "fmla v20.8h, v29.8h, v3.h[3]\n"
+      "fmla v24.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0x30]\n"
+      "fmla v9.8h, v28.8h, v0.h[3]\n"
+      "fmla v13.8h, v28.8h, v1.h[3]\n"
+      "fmla v17.8h, v28.8h, v2.h[3]\n"
+      "fmla v21.8h, v28.8h, v3.h[3]\n"
+      "fmla v25.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x9, #0x30]\n"
+      "fmla v10.8h, v29.8h, v0.h[3]\n"
+      "fmla v14.8h, v29.8h, v1.h[3]\n"
+      "fmla v18.8h, v29.8h, v2.h[3]\n"
+      "fmla v22.8h, v29.8h, v3.h[3]\n"
+      "fmla v26.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x12, #0x40]\n"
+      "fmla v11.8h, v28.8h, v0.h[3]\n"
+      "fmla v15.8h, v28.8h, v1.h[3]\n"
+      "fmla v19.8h, v28.8h, v2.h[3]\n"
+      "fmla v23.8h, v28.8h, v3.h[3]\n"
+      "fmla v27.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x11, #0x40]\n"
+      "fmla v8.8h, v29.8h, v0.h[4]\n"
+      "fmla v12.8h, v29.8h, v1.h[4]\n"
+      "fmla v16.8h, v29.8h, v2.h[4]\n"
+      "fmla v20.8h, v29.8h, v3.h[4]\n"
+      "fmla v24.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x10, #0x40]\n"
+      "fmla v9.8h, v28.8h, v0.h[4]\n"
+      "fmla v13.8h, v28.8h, v1.h[4]\n"
+      "fmla v17.8h, v28.8h, v2.h[4]\n"
+      "fmla v21.8h, v28.8h, v3.h[4]\n"
+      "fmla v25.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x9, #0x40]\n"
+      "fmla v10.8h, v29.8h, v0.h[4]\n"
+      "fmla v14.8h, v29.8h, v1.h[4]\n"
+      "fmla v18.8h, v29.8h, v2.h[4]\n"
+      "fmla v22.8h, v29.8h, v3.h[4]\n"
+      "fmla v26.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x12, #0x50]\n"
+      "fmla v11.8h, v28.8h, v0.h[4]\n"
+      "fmla v15.8h, v28.8h, v1.h[4]\n"
+      "fmla v19.8h, v28.8h, v2.h[4]\n"
+      "fmla v23.8h, v28.8h, v3.h[4]\n"
+      "fmla v27.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x11, #0x50]\n"
+      "fmla v8.8h, v29.8h, v0.h[5]\n"
+      "fmla v12.8h, v29.8h, v1.h[5]\n"
+      "fmla v16.8h, v29.8h, v2.h[5]\n"
+      "fmla v20.8h, v29.8h, v3.h[5]\n"
+      "fmla v24.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x10, #0x50]\n"
+      "fmla v9.8h, v28.8h, v0.h[5]\n"
+      "fmla v13.8h, v28.8h, v1.h[5]\n"
+      "fmla v17.8h, v28.8h, v2.h[5]\n"
+      "fmla v21.8h, v28.8h, v3.h[5]\n"
+      "fmla v25.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x9, #0x50]\n"
+      "fmla v10.8h, v29.8h, v0.h[5]\n"
+      "fmla v14.8h, v29.8h, v1.h[5]\n"
+      "fmla v18.8h, v29.8h, v2.h[5]\n"
+      "fmla v22.8h, v29.8h, v3.h[5]\n"
+      "fmla v26.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x12, #0x60]\n"
+      "fmla v11.8h, v28.8h, v0.h[5]\n"
+      "fmla v15.8h, v28.8h, v1.h[5]\n"
+      "fmla v19.8h, v28.8h, v2.h[5]\n"
+      "fmla v23.8h, v28.8h, v3.h[5]\n"
+      "fmla v27.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x11, #0x60]\n"
+      "fmla v8.8h, v29.8h, v0.h[6]\n"
+      "fmla v12.8h, v29.8h, v1.h[6]\n"
+      "fmla v16.8h, v29.8h, v2.h[6]\n"
+      "fmla v20.8h, v29.8h, v3.h[6]\n"
+      "fmla v24.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x10, #0x60]\n"
+      "fmla v9.8h, v28.8h, v0.h[6]\n"
+      "fmla v13.8h, v28.8h, v1.h[6]\n"
+      "fmla v17.8h, v28.8h, v2.h[6]\n"
+      "fmla v21.8h, v28.8h, v3.h[6]\n"
+      "fmla v25.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x9, #0x60]\n"
+      "fmla v10.8h, v29.8h, v0.h[6]\n"
+      "fmla v14.8h, v29.8h, v1.h[6]\n"
+      "fmla v18.8h, v29.8h, v2.h[6]\n"
+      "fmla v22.8h, v29.8h, v3.h[6]\n"
+      "fmla v26.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x12, #0x70]\n"
+      "fmla v11.8h, v28.8h, v0.h[6]\n"
       "add x12, x12, #0x80\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "fmla v27.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x11, #0x70]\n"
+      "fmla v15.8h, v28.8h, v1.h[6]\n"
+      "fmla v19.8h, v28.8h, v2.h[6]\n"
+      "fmla v23.8h, v28.8h, v3.h[6]\n"
+      "fmla v27.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x11, #0x70]\n"
       "add x11, x11, #0x80\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "fmla v24.8h, v6.8h, v4.h[7]\n"
-      "ldr q6, [x10, #0x70]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v8.8h, v29.8h, v0.h[7]\n"
+      "fmla v12.8h, v29.8h, v1.h[7]\n"
+      "fmla v16.8h, v29.8h, v2.h[7]\n"
+      "fmla v20.8h, v29.8h, v3.h[7]\n"
+      "fmla v24.8h, v29.8h, v4.h[7]\n"
+      "ldr q29, [x10, #0x70]\n"
+      "fmla v9.8h, v28.8h, v0.h[7]\n"
       "add x10, x10, #0x80\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "fmla v25.8h, v7.8h, v4.h[7]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "fmla v13.8h, v28.8h, v1.h[7]\n"
+      "fmla v17.8h, v28.8h, v2.h[7]\n"
+      "fmla v21.8h, v28.8h, v3.h[7]\n"
+      "fmla v25.8h, v28.8h, v4.h[7]\n"
+      "ldr q28, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v26.8h, v6.8h, v4.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
-      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v10.8h, v29.8h, v0.h[7]\n"
+      "fmla v14.8h, v29.8h, v1.h[7]\n"
+      "fmla v18.8h, v29.8h, v2.h[7]\n"
+      "fmla v22.8h, v29.8h, v3.h[7]\n"
+      "fmla v26.8h, v29.8h, v4.h[7]\n"
+      "fmla v11.8h, v28.8h, v0.h[7]\n"
+      "fmla v15.8h, v28.8h, v1.h[7]\n"
+      "fmla v19.8h, v28.8h, v2.h[7]\n"
+      "fmla v23.8h, v28.8h, v3.h[7]\n"
+      "fmla v27.8h, v28.8h, v4.h[7]\n"
       "229:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 231f\n"
       "230:"  // Height 5: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h4, [x26], #0x2\n"
+      "ldr h3, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
       "ldr h2, [x24], #0x2\n"
-      "ldr h3, [x23], #0x2\n"
-      "ldr h4, [x22], #0x2\n"
-      "ldr q6, [x12, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "ldr h1, [x23], #0x2\n"
+      "ldr h0, [x22], #0x2\n"
+      "ldr q29, [x12, #0x0]\n"
+      "fmla v8.8h, v29.8h, v4.h[0]\n"
+      "fmla v12.8h, v29.8h, v3.h[0]\n"
+      "ldr q28, [x11, #0x0]\n"
+      "fmla v16.8h, v29.8h, v2.h[0]\n"
+      "fmla v20.8h, v29.8h, v1.h[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "fmla v24.8h, v29.8h, v0.h[0]\n"
+      "ldr q29, [x10, #0x0]\n"
+      "fmla v9.8h, v28.8h, v4.h[0]\n"
       "add x11, x11, #0x10\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "fmla v13.8h, v28.8h, v3.h[0]\n"
+      "fmla v17.8h, v28.8h, v2.h[0]\n"
       "add x10, x10, #0x10\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "fmla v21.8h, v28.8h, v1.h[0]\n"
+      "fmla v25.8h, v28.8h, v0.h[0]\n"
+      "ldr q28, [x9, #0x0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "fmla v10.8h, v29.8h, v4.h[0]\n"
+      "fmla v14.8h, v29.8h, v3.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v1.h[0]\n"
+      "fmla v26.8h, v29.8h, v0.h[0]\n"
+      "fmla v11.8h, v28.8h, v4.h[0]\n"
+      "fmla v15.8h, v28.8h, v3.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v1.h[0]\n"
+      "fmla v27.8h, v28.8h, v0.h[0]\n"
       "cbnz x27, 230b\n"
       "231:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3812,49 +3812,49 @@
       "add x22, x23, x20, LSL #1\n"
       "tbz %x[flags], #1, 232f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v29.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmin v16.8h, v16.8h, v1.8h\n"
-      "fmin v17.8h, v17.8h, v1.8h\n"
-      "fmin v18.8h, v18.8h, v1.8h\n"
-      "fmin v19.8h, v19.8h, v1.8h\n"
-      "fmin v20.8h, v20.8h, v1.8h\n"
-      "fmin v21.8h, v21.8h, v1.8h\n"
-      "fmin v22.8h, v22.8h, v1.8h\n"
-      "fmin v23.8h, v23.8h, v1.8h\n"
-      "fmin v24.8h, v24.8h, v1.8h\n"
-      "fmin v25.8h, v25.8h, v1.8h\n"
-      "fmin v26.8h, v26.8h, v1.8h\n"
-      "fmin v27.8h, v27.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
-      "fmax v20.8h, v20.8h, v0.8h\n"
-      "fmax v21.8h, v21.8h, v0.8h\n"
-      "fmax v22.8h, v22.8h, v0.8h\n"
-      "fmax v23.8h, v23.8h, v0.8h\n"
-      "fmax v24.8h, v24.8h, v0.8h\n"
-      "fmax v25.8h, v25.8h, v0.8h\n"
-      "fmax v26.8h, v26.8h, v0.8h\n"
-      "fmax v27.8h, v27.8h, v0.8h\n"
+      "ld1r { v28.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v29.8h\n"
+      "fmin v9.8h, v9.8h, v29.8h\n"
+      "fmin v10.8h, v10.8h, v29.8h\n"
+      "fmin v11.8h, v11.8h, v29.8h\n"
+      "fmin v12.8h, v12.8h, v29.8h\n"
+      "fmin v13.8h, v13.8h, v29.8h\n"
+      "fmin v14.8h, v14.8h, v29.8h\n"
+      "fmin v15.8h, v15.8h, v29.8h\n"
+      "fmin v16.8h, v16.8h, v29.8h\n"
+      "fmin v17.8h, v17.8h, v29.8h\n"
+      "fmin v18.8h, v18.8h, v29.8h\n"
+      "fmin v19.8h, v19.8h, v29.8h\n"
+      "fmin v20.8h, v20.8h, v29.8h\n"
+      "fmin v21.8h, v21.8h, v29.8h\n"
+      "fmin v22.8h, v22.8h, v29.8h\n"
+      "fmin v23.8h, v23.8h, v29.8h\n"
+      "fmin v24.8h, v24.8h, v29.8h\n"
+      "fmin v25.8h, v25.8h, v29.8h\n"
+      "fmin v26.8h, v26.8h, v29.8h\n"
+      "fmin v27.8h, v27.8h, v29.8h\n"
+      "fmax v8.8h, v8.8h, v28.8h\n"
+      "fmax v9.8h, v9.8h, v28.8h\n"
+      "fmax v10.8h, v10.8h, v28.8h\n"
+      "fmax v11.8h, v11.8h, v28.8h\n"
+      "fmax v12.8h, v12.8h, v28.8h\n"
+      "fmax v13.8h, v13.8h, v28.8h\n"
+      "fmax v14.8h, v14.8h, v28.8h\n"
+      "fmax v15.8h, v15.8h, v28.8h\n"
+      "fmax v16.8h, v16.8h, v28.8h\n"
+      "fmax v17.8h, v17.8h, v28.8h\n"
+      "fmax v18.8h, v18.8h, v28.8h\n"
+      "fmax v19.8h, v19.8h, v28.8h\n"
+      "fmax v20.8h, v20.8h, v28.8h\n"
+      "fmax v21.8h, v21.8h, v28.8h\n"
+      "fmax v22.8h, v22.8h, v28.8h\n"
+      "fmax v23.8h, v23.8h, v28.8h\n"
+      "fmax v24.8h, v24.8h, v28.8h\n"
+      "fmax v25.8h, v25.8h, v28.8h\n"
+      "fmax v26.8h, v26.8h, v28.8h\n"
+      "fmax v27.8h, v27.8h, v28.8h\n"
       "232:"  // Height 5: No activation
       "cmp x14, #0x20\n"
       "bge 249f\n"
@@ -4497,16 +4497,16 @@
       "274:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 275f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 276f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -4518,11 +4518,11 @@
       "b 276f\n"
       "275:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "276:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "blt 279f\n"
@@ -5017,45 +5017,45 @@
       "279:"  // Height 6: Multiply loop: Main loop skip
       "cbz x27, 281f\n"
       "280:"  // Height 6: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h7, [x26], #0x2\n"
+      "ldr h6, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr h2, [x24], #0x2\n"
-      "ldr h3, [x23], #0x2\n"
-      "ldr h4, [x22], #0x2\n"
-      "ldr h5, [x21], #0x2\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "ldr h5, [x24], #0x2\n"
+      "ldr h4, [x23], #0x2\n"
+      "ldr h3, [x22], #0x2\n"
+      "ldr h2, [x21], #0x2\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q0, [x11, #0x0]\n"
+      "fmla v8.8h, v1.8h, v7.h[0]\n"
+      "fmla v12.8h, v1.8h, v6.h[0]\n"
+      "fmla v16.8h, v1.8h, v5.h[0]\n"
+      "fmla v20.8h, v1.8h, v4.h[0]\n"
       "add x12, x12, #0x10\n"
       "add x11, x11, #0x10\n"
-      "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "fmla v28.8h, v6.8h, v5.h[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "fmla v24.8h, v1.8h, v3.h[0]\n"
+      "fmla v28.8h, v1.8h, v2.h[0]\n"
+      "ldr q1, [x10, #0x0]\n"
       "add x10, x10, #0x10\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "fmla v29.8h, v7.8h, v5.h[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "fmla v9.8h, v0.8h, v7.h[0]\n"
+      "fmla v13.8h, v0.8h, v6.h[0]\n"
+      "fmla v17.8h, v0.8h, v5.h[0]\n"
+      "fmla v21.8h, v0.8h, v4.h[0]\n"
+      "fmla v25.8h, v0.8h, v3.h[0]\n"
+      "fmla v29.8h, v0.8h, v2.h[0]\n"
+      "ldr q0, [x9, #0x0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "fmla v30.8h, v6.8h, v5.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "fmla v31.8h, v7.8h, v5.h[0]\n"
+      "fmla v10.8h, v1.8h, v7.h[0]\n"
+      "fmla v14.8h, v1.8h, v6.h[0]\n"
+      "fmla v18.8h, v1.8h, v5.h[0]\n"
+      "fmla v22.8h, v1.8h, v4.h[0]\n"
+      "fmla v26.8h, v1.8h, v3.h[0]\n"
+      "fmla v30.8h, v1.8h, v2.h[0]\n"
+      "fmla v11.8h, v0.8h, v7.h[0]\n"
+      "fmla v15.8h, v0.8h, v6.h[0]\n"
+      "fmla v19.8h, v0.8h, v5.h[0]\n"
+      "fmla v23.8h, v0.8h, v4.h[0]\n"
+      "fmla v27.8h, v0.8h, v3.h[0]\n"
+      "fmla v31.8h, v0.8h, v2.h[0]\n"
       "cbnz x27, 280b\n"
       "281:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
index 08f5aeb..94fb84e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
index e0fbe17..b1cd6dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp
@@ -209,11 +209,11 @@
       "16:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -230,37 +230,37 @@
       "blt 20f\n"
       "19:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "ldr q17, [x12, #0x10]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr q16, [x11, #0x10]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x12, #0x30]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x11, #0x30]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x10, #0x30]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x9, #0x30]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x8\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "add x26, x26, #0x10\n"
       "ldr q0, [x26, #0x0]\n"
       "add x12, x12, #0x40\n"
@@ -272,36 +272,36 @@
       "bge 19b\n"
       "20:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "ldr q17, [x12, #0x10]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr q16, [x11, #0x10]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x12, #0x30]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x11, #0x30]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x10, #0x30]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x9, #0x30]\n"
       "sub x27, x27, #0x4\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "add x26, x26, #0x10\n"
       "add x12, x12, #0x40\n"
       "add x11, x11, #0x40\n"
@@ -310,16 +310,16 @@
       "21:"  // Height 1: Multiply loop: Main loop skip
       "cbz x27, 23f\n"
       "22:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v8.4s, v16.4s, v18.s[0]\n"
       "sub x27, x27, #0x1\n"
-      "ldr q7, [x11, #0x0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "ldr q17, [x11, #0x0]\n"
+      "ldr q16, [x10, #0x0]\n"
+      "fmla v9.4s, v17.4s, v18.s[0]\n"
+      "fmla v10.4s, v16.4s, v18.s[0]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v11.4s, v16.4s, v18.s[0]\n"
       "add x12, x12, #0x10\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
@@ -332,17 +332,17 @@
       "bne 16b\n"
       "tbz %x[flags], #1, 24f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
       "24:"  // Height 1: No activation
       "cmp x14, #0x10\n"
       "bge 33f\n"
@@ -538,12 +538,12 @@
       "50:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 51f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 52f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -551,7 +551,7 @@
       "b 52f\n"
       "51:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "52:"  // Height 2: input setup done
       "cmp x27, #0x4\n"
       "blt 55f\n"
@@ -564,137 +564,137 @@
       "53:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "sub x27, x27, #0x4\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x0]\n"
       "cmp x27, #0x8\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "fmla v14.4s, v17.4s, v1.s[0]\n"
+      "ldr q17, [x12, #0x10]\n"
       "add x26, x26, #0x10\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "fmla v15.4s, v16.4s, v1.s[0]\n"
+      "ldr q16, [x11, #0x10]\n"
       "add x25, x25, #0x10\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "fmla v12.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "fmla v13.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "fmla v14.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "fmla v15.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "fmla v12.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "fmla v13.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "fmla v14.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x12, #0x30]\n"
       "add x12, x12, #0x40\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "fmla v15.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v1.s[3]\n"
+      "ldr q17, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v1.s[3]\n"
+      "ldr q16, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v14.4s, v17.4s, v1.s[3]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v16.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 53b\n"
       "54:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x0]\n"
       "sub x27, x27, #0x4\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x0]\n"
       "add x26, x26, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "fmla v14.4s, v17.4s, v1.s[0]\n"
+      "ldr q17, [x12, #0x10]\n"
       "add x25, x25, #0x10\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "fmla v15.4s, v16.4s, v1.s[0]\n"
+      "ldr q16, [x11, #0x10]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "fmla v12.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "fmla v13.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "fmla v14.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x12, #0x20]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "fmla v15.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x11, #0x20]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "fmla v12.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "fmla v13.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x9, #0x20]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "fmla v14.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x12, #0x30]\n"
       "add x12, x12, #0x40\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "fmla v15.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v1.s[3]\n"
+      "ldr q17, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v1.s[3]\n"
+      "ldr q16, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v14.4s, v17.4s, v1.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
+      "fmla v15.4s, v16.4s, v1.s[3]\n"
       "55:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 57f\n"
       "56:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "ldr q17, [x12, #0x0]\n"
+      "ldr q16, [x11, #0x0]\n"
+      "fmla v8.4s, v17.4s, v19.s[0]\n"
+      "fmla v12.4s, v17.4s, v18.s[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "fmla v9.4s, v16.4s, v19.s[0]\n"
+      "fmla v13.4s, v16.4s, v18.s[0]\n"
+      "ldr q16, [x9, #0x0]\n"
+      "fmla v10.4s, v17.4s, v19.s[0]\n"
+      "fmla v14.4s, v17.4s, v18.s[0]\n"
       "add x12, x12, #0x10\n"
       "add x11, x11, #0x10\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v11.4s, v16.4s, v19.s[0]\n"
+      "fmla v15.4s, v16.4s, v18.s[0]\n"
       "add x10, x10, #0x10\n"
       "add x9, x9, #0x10\n"
       "cbnz x27, 56b\n"
@@ -707,25 +707,25 @@
       "add x25, x13, x20, LSL #2\n"
       "tbz %x[flags], #1, 58f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmin v12.4s, v12.4s, v17.4s\n"
+      "fmin v13.4s, v13.4s, v17.4s\n"
+      "fmin v14.4s, v14.4s, v17.4s\n"
+      "fmin v15.4s, v15.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
+      "fmax v12.4s, v12.4s, v16.4s\n"
+      "fmax v13.4s, v13.4s, v16.4s\n"
+      "fmax v14.4s, v14.4s, v16.4s\n"
+      "fmax v15.4s, v15.4s, v16.4s\n"
       "58:"  // Height 2: No activation
       "cmp x14, #0x10\n"
       "bge 67f\n"
@@ -970,13 +970,13 @@
       "84:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 85f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 86f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -985,8 +985,8 @@
       "b 86f\n"
       "85:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "86:"  // Height 3: input setup done
       "cmp x27, #0x4\n"
       "blt 89f\n"
@@ -1003,75 +1003,75 @@
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x8\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q21, [x10, #0x0]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x26, x26, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q20, [x9, #0x0]\n"
       "add x25, x25, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v21.4s, v0.s[0]\n"
+      "fmla v14.4s, v21.4s, v1.s[0]\n"
       "add x24, x24, #0x10\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v18.4s, v21.4s, v2.s[0]\n"
+      "ldr q21, [x12, #0x10]\n"
+      "fmla v11.4s, v20.4s, v0.s[0]\n"
+      "fmla v15.4s, v20.4s, v1.s[0]\n"
+      "fmla v19.4s, v20.4s, v2.s[0]\n"
+      "ldr q20, [x11, #0x10]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v12.4s, v21.4s, v1.s[1]\n"
+      "fmla v16.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x10, #0x10]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v13.4s, v20.4s, v1.s[1]\n"
+      "fmla v17.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x9, #0x10]\n"
+      "fmla v10.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "fmla v18.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x12, #0x20]\n"
+      "fmla v11.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "fmla v19.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x11, #0x20]\n"
+      "fmla v8.4s, v21.4s, v0.s[2]\n"
+      "fmla v12.4s, v21.4s, v1.s[2]\n"
+      "fmla v16.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "fmla v9.4s, v20.4s, v0.s[2]\n"
+      "fmla v13.4s, v20.4s, v1.s[2]\n"
+      "fmla v17.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x9, #0x20]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v14.4s, v21.4s, v1.s[2]\n"
+      "fmla v18.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x12, #0x30]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
       "add x12, x12, #0x40\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v15.4s, v20.4s, v1.s[2]\n"
+      "fmla v19.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v12.4s, v21.4s, v1.s[3]\n"
+      "fmla v16.4s, v21.4s, v2.s[3]\n"
+      "ldr q21, [x10, #0x30]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
       "add x10, x10, #0x40\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v13.4s, v20.4s, v1.s[3]\n"
+      "fmla v17.4s, v20.4s, v2.s[3]\n"
+      "ldr q20, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v10.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "fmla v18.4s, v21.4s, v2.s[3]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v20.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v19.4s, v20.4s, v2.s[3]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 87b\n"
@@ -1081,98 +1081,98 @@
       "sub x27, x27, #0x4\n"
       "add x26, x26, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q21, [x10, #0x0]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x25, x25, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q20, [x9, #0x0]\n"
       "add x24, x24, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v10.4s, v21.4s, v0.s[0]\n"
+      "fmla v14.4s, v21.4s, v1.s[0]\n"
+      "fmla v18.4s, v21.4s, v2.s[0]\n"
+      "ldr q21, [x12, #0x10]\n"
+      "fmla v11.4s, v20.4s, v0.s[0]\n"
+      "fmla v15.4s, v20.4s, v1.s[0]\n"
+      "fmla v19.4s, v20.4s, v2.s[0]\n"
+      "ldr q20, [x11, #0x10]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v12.4s, v21.4s, v1.s[1]\n"
+      "fmla v16.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x10, #0x10]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v13.4s, v20.4s, v1.s[1]\n"
+      "fmla v17.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x9, #0x10]\n"
+      "fmla v10.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "fmla v18.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x12, #0x20]\n"
+      "fmla v11.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "fmla v19.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x11, #0x20]\n"
+      "fmla v8.4s, v21.4s, v0.s[2]\n"
+      "fmla v12.4s, v21.4s, v1.s[2]\n"
+      "fmla v16.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "fmla v9.4s, v20.4s, v0.s[2]\n"
+      "fmla v13.4s, v20.4s, v1.s[2]\n"
+      "fmla v17.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x9, #0x20]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v14.4s, v21.4s, v1.s[2]\n"
+      "fmla v18.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x12, #0x30]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
       "add x12, x12, #0x40\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v15.4s, v20.4s, v1.s[2]\n"
+      "fmla v19.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v12.4s, v21.4s, v1.s[3]\n"
+      "fmla v16.4s, v21.4s, v2.s[3]\n"
+      "ldr q21, [x10, #0x30]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
       "add x10, x10, #0x40\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v13.4s, v20.4s, v1.s[3]\n"
+      "fmla v17.4s, v20.4s, v2.s[3]\n"
+      "ldr q20, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v10.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "fmla v18.4s, v21.4s, v2.s[3]\n"
+      "fmla v11.4s, v20.4s, v0.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "fmla v19.4s, v20.4s, v2.s[3]\n"
       "89:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 91f\n"
       "90:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr q21, [x12, #0x0]\n"
+      "fmla v8.4s, v21.4s, v24.s[0]\n"
+      "fmla v12.4s, v21.4s, v23.s[0]\n"
+      "ldr q20, [x11, #0x0]\n"
+      "fmla v16.4s, v21.4s, v22.s[0]\n"
+      "ldr q21, [x10, #0x0]\n"
+      "fmla v9.4s, v20.4s, v24.s[0]\n"
+      "fmla v13.4s, v20.4s, v23.s[0]\n"
+      "fmla v17.4s, v20.4s, v22.s[0]\n"
+      "ldr q20, [x9, #0x0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v21.4s, v24.s[0]\n"
+      "fmla v14.4s, v21.4s, v23.s[0]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v18.4s, v21.4s, v22.s[0]\n"
+      "fmla v11.4s, v20.4s, v24.s[0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v15.4s, v20.4s, v23.s[0]\n"
+      "fmla v19.4s, v20.4s, v22.s[0]\n"
       "cbnz x27, 90b\n"
       "91:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1184,33 +1184,33 @@
       "add x24, x25, x20, LSL #2\n"
       "tbz %x[flags], #1, 92f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v21.4s\n"
+      "fmin v9.4s, v9.4s, v21.4s\n"
+      "fmin v10.4s, v10.4s, v21.4s\n"
+      "fmin v11.4s, v11.4s, v21.4s\n"
+      "fmin v12.4s, v12.4s, v21.4s\n"
+      "fmin v13.4s, v13.4s, v21.4s\n"
+      "fmin v14.4s, v14.4s, v21.4s\n"
+      "fmin v15.4s, v15.4s, v21.4s\n"
+      "fmin v16.4s, v16.4s, v21.4s\n"
+      "fmin v17.4s, v17.4s, v21.4s\n"
+      "fmin v18.4s, v18.4s, v21.4s\n"
+      "fmin v19.4s, v19.4s, v21.4s\n"
+      "fmax v8.4s, v8.4s, v20.4s\n"
+      "fmax v9.4s, v9.4s, v20.4s\n"
+      "fmax v10.4s, v10.4s, v20.4s\n"
+      "fmax v11.4s, v11.4s, v20.4s\n"
+      "fmax v12.4s, v12.4s, v20.4s\n"
+      "fmax v13.4s, v13.4s, v20.4s\n"
+      "fmax v14.4s, v14.4s, v20.4s\n"
+      "fmax v15.4s, v15.4s, v20.4s\n"
+      "fmax v16.4s, v16.4s, v20.4s\n"
+      "fmax v17.4s, v17.4s, v20.4s\n"
+      "fmax v18.4s, v18.4s, v20.4s\n"
+      "fmax v19.4s, v19.4s, v20.4s\n"
       "92:"  // Height 3: No activation
       "cmp x14, #0x10\n"
       "bge 101f\n"
@@ -1504,14 +1504,14 @@
       "118:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 119f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 120f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1521,9 +1521,9 @@
       "b 120f\n"
       "119:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "120:"  // Height 4: input setup done
       "cmp x27, #0x4\n"
       "blt 123f\n"
@@ -1542,7 +1542,7 @@
       "cmp x27, #0x8\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x0]\n"
       "add x26, x26, #0x10\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1550,84 +1550,84 @@
       "add x24, x24, #0x10\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "ldr q24, [x9, #0x0]\n"
       "add x23, x23, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
+      "fmla v10.4s, v25.4s, v0.s[0]\n"
+      "fmla v14.4s, v25.4s, v1.s[0]\n"
+      "fmla v18.4s, v25.4s, v2.s[0]\n"
+      "fmla v22.4s, v25.4s, v3.s[0]\n"
+      "ldr q25, [x12, #0x10]\n"
+      "fmla v11.4s, v24.4s, v0.s[0]\n"
+      "fmla v15.4s, v24.4s, v1.s[0]\n"
+      "fmla v19.4s, v24.4s, v2.s[0]\n"
+      "fmla v23.4s, v24.4s, v3.s[0]\n"
+      "ldr q24, [x11, #0x10]\n"
+      "fmla v8.4s, v25.4s, v0.s[1]\n"
+      "fmla v12.4s, v25.4s, v1.s[1]\n"
+      "fmla v16.4s, v25.4s, v2.s[1]\n"
+      "fmla v20.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "fmla v9.4s, v24.4s, v0.s[1]\n"
+      "fmla v13.4s, v24.4s, v1.s[1]\n"
+      "fmla v17.4s, v24.4s, v2.s[1]\n"
+      "fmla v21.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x9, #0x10]\n"
+      "fmla v10.4s, v25.4s, v0.s[1]\n"
+      "fmla v14.4s, v25.4s, v1.s[1]\n"
+      "fmla v18.4s, v25.4s, v2.s[1]\n"
+      "fmla v22.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x12, #0x20]\n"
+      "fmla v11.4s, v24.4s, v0.s[1]\n"
+      "fmla v15.4s, v24.4s, v1.s[1]\n"
+      "fmla v19.4s, v24.4s, v2.s[1]\n"
+      "fmla v23.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x11, #0x20]\n"
+      "fmla v8.4s, v25.4s, v0.s[2]\n"
+      "fmla v12.4s, v25.4s, v1.s[2]\n"
+      "fmla v16.4s, v25.4s, v2.s[2]\n"
+      "fmla v20.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x10, #0x20]\n"
+      "fmla v9.4s, v24.4s, v0.s[2]\n"
+      "fmla v13.4s, v24.4s, v1.s[2]\n"
+      "fmla v17.4s, v24.4s, v2.s[2]\n"
+      "fmla v21.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x9, #0x20]\n"
+      "fmla v10.4s, v25.4s, v0.s[2]\n"
+      "fmla v14.4s, v25.4s, v1.s[2]\n"
+      "fmla v18.4s, v25.4s, v2.s[2]\n"
+      "fmla v22.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x12, #0x30]\n"
       "add x12, x12, #0x40\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v11.4s, v24.4s, v0.s[2]\n"
+      "fmla v15.4s, v24.4s, v1.s[2]\n"
+      "fmla v19.4s, v24.4s, v2.s[2]\n"
+      "fmla v23.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
+      "fmla v8.4s, v25.4s, v0.s[3]\n"
+      "fmla v12.4s, v25.4s, v1.s[3]\n"
+      "fmla v16.4s, v25.4s, v2.s[3]\n"
+      "fmla v20.4s, v25.4s, v3.s[3]\n"
+      "ldr q25, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v9.4s, v24.4s, v0.s[3]\n"
+      "fmla v13.4s, v24.4s, v1.s[3]\n"
+      "fmla v17.4s, v24.4s, v2.s[3]\n"
+      "fmla v21.4s, v24.4s, v3.s[3]\n"
+      "ldr q24, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v10.4s, v25.4s, v0.s[3]\n"
+      "fmla v14.4s, v25.4s, v1.s[3]\n"
+      "fmla v18.4s, v25.4s, v2.s[3]\n"
+      "fmla v22.4s, v25.4s, v3.s[3]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v24.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v24.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v19.4s, v24.4s, v2.s[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v23.4s, v24.4s, v3.s[3]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 121b\n"
@@ -1638,7 +1638,7 @@
       "add x26, x26, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x0]\n"
       "add x25, x25, #0x10\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1646,111 +1646,111 @@
       "add x23, x23, #0x10\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
+      "ldr q24, [x9, #0x0]\n"
+      "fmla v10.4s, v25.4s, v0.s[0]\n"
+      "fmla v14.4s, v25.4s, v1.s[0]\n"
+      "fmla v18.4s, v25.4s, v2.s[0]\n"
+      "fmla v22.4s, v25.4s, v3.s[0]\n"
+      "ldr q25, [x12, #0x10]\n"
+      "fmla v11.4s, v24.4s, v0.s[0]\n"
+      "fmla v15.4s, v24.4s, v1.s[0]\n"
+      "fmla v19.4s, v24.4s, v2.s[0]\n"
+      "fmla v23.4s, v24.4s, v3.s[0]\n"
+      "ldr q24, [x11, #0x10]\n"
+      "fmla v8.4s, v25.4s, v0.s[1]\n"
+      "fmla v12.4s, v25.4s, v1.s[1]\n"
+      "fmla v16.4s, v25.4s, v2.s[1]\n"
+      "fmla v20.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "fmla v9.4s, v24.4s, v0.s[1]\n"
+      "fmla v13.4s, v24.4s, v1.s[1]\n"
+      "fmla v17.4s, v24.4s, v2.s[1]\n"
+      "fmla v21.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x9, #0x10]\n"
+      "fmla v10.4s, v25.4s, v0.s[1]\n"
+      "fmla v14.4s, v25.4s, v1.s[1]\n"
+      "fmla v18.4s, v25.4s, v2.s[1]\n"
+      "fmla v22.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x12, #0x20]\n"
+      "fmla v11.4s, v24.4s, v0.s[1]\n"
+      "fmla v15.4s, v24.4s, v1.s[1]\n"
+      "fmla v19.4s, v24.4s, v2.s[1]\n"
+      "fmla v23.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x11, #0x20]\n"
+      "fmla v8.4s, v25.4s, v0.s[2]\n"
+      "fmla v12.4s, v25.4s, v1.s[2]\n"
+      "fmla v16.4s, v25.4s, v2.s[2]\n"
+      "fmla v20.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x10, #0x20]\n"
+      "fmla v9.4s, v24.4s, v0.s[2]\n"
+      "fmla v13.4s, v24.4s, v1.s[2]\n"
+      "fmla v17.4s, v24.4s, v2.s[2]\n"
+      "fmla v21.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x9, #0x20]\n"
+      "fmla v10.4s, v25.4s, v0.s[2]\n"
+      "fmla v14.4s, v25.4s, v1.s[2]\n"
+      "fmla v18.4s, v25.4s, v2.s[2]\n"
+      "fmla v22.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x12, #0x30]\n"
       "add x12, x12, #0x40\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v11.4s, v24.4s, v0.s[2]\n"
+      "fmla v15.4s, v24.4s, v1.s[2]\n"
+      "fmla v19.4s, v24.4s, v2.s[2]\n"
+      "fmla v23.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
+      "fmla v8.4s, v25.4s, v0.s[3]\n"
+      "fmla v12.4s, v25.4s, v1.s[3]\n"
+      "fmla v16.4s, v25.4s, v2.s[3]\n"
+      "fmla v20.4s, v25.4s, v3.s[3]\n"
+      "ldr q25, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v9.4s, v24.4s, v0.s[3]\n"
+      "fmla v13.4s, v24.4s, v1.s[3]\n"
+      "fmla v17.4s, v24.4s, v2.s[3]\n"
+      "fmla v21.4s, v24.4s, v3.s[3]\n"
+      "ldr q24, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v10.4s, v25.4s, v0.s[3]\n"
+      "fmla v14.4s, v25.4s, v1.s[3]\n"
+      "fmla v18.4s, v25.4s, v2.s[3]\n"
+      "fmla v22.4s, v25.4s, v3.s[3]\n"
+      "fmla v11.4s, v24.4s, v0.s[3]\n"
+      "fmla v15.4s, v24.4s, v1.s[3]\n"
+      "fmla v19.4s, v24.4s, v2.s[3]\n"
+      "fmla v23.4s, v24.4s, v3.s[3]\n"
       "123:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 125f\n"
       "124:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr q25, [x12, #0x0]\n"
+      "ldr q24, [x11, #0x0]\n"
+      "fmla v8.4s, v25.4s, v29.s[0]\n"
+      "fmla v12.4s, v25.4s, v28.s[0]\n"
+      "fmla v16.4s, v25.4s, v27.s[0]\n"
+      "fmla v20.4s, v25.4s, v26.s[0]\n"
+      "ldr q25, [x10, #0x0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "fmla v9.4s, v24.4s, v29.s[0]\n"
+      "fmla v13.4s, v24.4s, v28.s[0]\n"
       "add x11, x11, #0x10\n"
       "add x10, x10, #0x10\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "fmla v17.4s, v24.4s, v27.s[0]\n"
+      "fmla v21.4s, v24.4s, v26.s[0]\n"
+      "ldr q24, [x9, #0x0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v10.4s, v25.4s, v29.s[0]\n"
+      "fmla v14.4s, v25.4s, v28.s[0]\n"
+      "fmla v18.4s, v25.4s, v27.s[0]\n"
+      "fmla v22.4s, v25.4s, v26.s[0]\n"
+      "fmla v11.4s, v24.4s, v29.s[0]\n"
+      "fmla v15.4s, v24.4s, v28.s[0]\n"
+      "fmla v19.4s, v24.4s, v27.s[0]\n"
+      "fmla v23.4s, v24.4s, v26.s[0]\n"
       "cbnz x27, 124b\n"
       "125:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1763,41 +1763,41 @@
       "add x23, x24, x20, LSL #2\n"
       "tbz %x[flags], #1, 126f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v25.4s\n"
+      "fmin v9.4s, v9.4s, v25.4s\n"
+      "fmin v10.4s, v10.4s, v25.4s\n"
+      "fmin v11.4s, v11.4s, v25.4s\n"
+      "fmin v12.4s, v12.4s, v25.4s\n"
+      "fmin v13.4s, v13.4s, v25.4s\n"
+      "fmin v14.4s, v14.4s, v25.4s\n"
+      "fmin v15.4s, v15.4s, v25.4s\n"
+      "fmin v16.4s, v16.4s, v25.4s\n"
+      "fmin v17.4s, v17.4s, v25.4s\n"
+      "fmin v18.4s, v18.4s, v25.4s\n"
+      "fmin v19.4s, v19.4s, v25.4s\n"
+      "fmin v20.4s, v20.4s, v25.4s\n"
+      "fmin v21.4s, v21.4s, v25.4s\n"
+      "fmin v22.4s, v22.4s, v25.4s\n"
+      "fmin v23.4s, v23.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v24.4s\n"
+      "fmax v9.4s, v9.4s, v24.4s\n"
+      "fmax v10.4s, v10.4s, v24.4s\n"
+      "fmax v11.4s, v11.4s, v24.4s\n"
+      "fmax v12.4s, v12.4s, v24.4s\n"
+      "fmax v13.4s, v13.4s, v24.4s\n"
+      "fmax v14.4s, v14.4s, v24.4s\n"
+      "fmax v15.4s, v15.4s, v24.4s\n"
+      "fmax v16.4s, v16.4s, v24.4s\n"
+      "fmax v17.4s, v17.4s, v24.4s\n"
+      "fmax v18.4s, v18.4s, v24.4s\n"
+      "fmax v19.4s, v19.4s, v24.4s\n"
+      "fmax v20.4s, v20.4s, v24.4s\n"
+      "fmax v21.4s, v21.4s, v24.4s\n"
+      "fmax v22.4s, v22.4s, v24.4s\n"
+      "fmax v23.4s, v23.4s, v24.4s\n"
       "126:"  // Height 4: No activation
       "cmp x14, #0x10\n"
       "bge 135f\n"
@@ -2140,15 +2140,15 @@
       "152:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 153f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 154f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -2159,10 +2159,10 @@
       "b 154f\n"
       "153:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "154:"  // Height 5: input setup done
       "cmp x27, #0x4\n"
       "blt 157f\n"
@@ -2185,7 +2185,7 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x0]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x24, x24, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2194,98 +2194,98 @@
       "add x22, x22, #0x10\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "fmla v24.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "fmla v26.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "fmla v27.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "fmla v24.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "fmla v25.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "fmla v26.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "ldr q28, [x9, #0x0]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "fmla v18.4s, v29.4s, v2.s[0]\n"
+      "fmla v22.4s, v29.4s, v3.s[0]\n"
+      "fmla v26.4s, v29.4s, v4.s[0]\n"
+      "ldr q29, [x12, #0x10]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v2.s[0]\n"
+      "fmla v23.4s, v28.4s, v3.s[0]\n"
+      "fmla v27.4s, v28.4s, v4.s[0]\n"
+      "ldr q28, [x11, #0x10]\n"
+      "fmla v8.4s, v29.4s, v0.s[1]\n"
+      "fmla v12.4s, v29.4s, v1.s[1]\n"
+      "fmla v16.4s, v29.4s, v2.s[1]\n"
+      "fmla v20.4s, v29.4s, v3.s[1]\n"
+      "fmla v24.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "fmla v9.4s, v28.4s, v0.s[1]\n"
+      "fmla v13.4s, v28.4s, v1.s[1]\n"
+      "fmla v17.4s, v28.4s, v2.s[1]\n"
+      "fmla v21.4s, v28.4s, v3.s[1]\n"
+      "fmla v25.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x9, #0x10]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v14.4s, v29.4s, v1.s[1]\n"
+      "fmla v18.4s, v29.4s, v2.s[1]\n"
+      "fmla v22.4s, v29.4s, v3.s[1]\n"
+      "fmla v26.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x12, #0x20]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v15.4s, v28.4s, v1.s[1]\n"
+      "fmla v19.4s, v28.4s, v2.s[1]\n"
+      "fmla v23.4s, v28.4s, v3.s[1]\n"
+      "fmla v27.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x11, #0x20]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v12.4s, v29.4s, v1.s[2]\n"
+      "fmla v16.4s, v29.4s, v2.s[2]\n"
+      "fmla v20.4s, v29.4s, v3.s[2]\n"
+      "fmla v24.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v13.4s, v28.4s, v1.s[2]\n"
+      "fmla v17.4s, v28.4s, v2.s[2]\n"
+      "fmla v21.4s, v28.4s, v3.s[2]\n"
+      "fmla v25.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x9, #0x20]\n"
+      "fmla v10.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v18.4s, v29.4s, v2.s[2]\n"
+      "fmla v22.4s, v29.4s, v3.s[2]\n"
+      "fmla v26.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x12, #0x30]\n"
+      "fmla v11.4s, v28.4s, v0.s[2]\n"
       "add x12, x12, #0x40\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "fmla v27.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v19.4s, v28.4s, v2.s[2]\n"
+      "fmla v23.4s, v28.4s, v3.s[2]\n"
+      "fmla v27.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "fmla v24.4s, v6.4s, v4.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v8.4s, v29.4s, v0.s[3]\n"
+      "fmla v12.4s, v29.4s, v1.s[3]\n"
+      "fmla v16.4s, v29.4s, v2.s[3]\n"
+      "fmla v20.4s, v29.4s, v3.s[3]\n"
+      "fmla v24.4s, v29.4s, v4.s[3]\n"
+      "ldr q29, [x10, #0x30]\n"
+      "fmla v9.4s, v28.4s, v0.s[3]\n"
       "add x10, x10, #0x40\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "fmla v25.4s, v7.4s, v4.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v13.4s, v28.4s, v1.s[3]\n"
+      "fmla v17.4s, v28.4s, v2.s[3]\n"
+      "fmla v21.4s, v28.4s, v3.s[3]\n"
+      "fmla v25.4s, v28.4s, v4.s[3]\n"
+      "ldr q28, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "fmla v14.4s, v29.4s, v1.s[3]\n"
+      "fmla v18.4s, v29.4s, v2.s[3]\n"
+      "fmla v22.4s, v29.4s, v3.s[3]\n"
+      "fmla v26.4s, v29.4s, v4.s[3]\n"
       "ldr q6, [x12, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v28.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v19.4s, v28.4s, v2.s[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v23.4s, v28.4s, v3.s[3]\n"
       "ldr q3, [x23, #0x0]\n"
-      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v27.4s, v28.4s, v4.s[3]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x11, #0x0]\n"
       "bge 155b\n"
@@ -2299,7 +2299,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x0]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x23, x23, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2307,130 +2307,130 @@
       "add x22, x22, #0x10\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x12, #0x10]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "fmla v24.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x9, #0x10]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "fmla v26.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x12, #0x20]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "fmla v27.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x11, #0x20]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "fmla v24.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "fmla v25.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x9, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "fmla v26.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x12, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "ldr q28, [x9, #0x0]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "fmla v18.4s, v29.4s, v2.s[0]\n"
+      "fmla v22.4s, v29.4s, v3.s[0]\n"
+      "fmla v26.4s, v29.4s, v4.s[0]\n"
+      "ldr q29, [x12, #0x10]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v2.s[0]\n"
+      "fmla v23.4s, v28.4s, v3.s[0]\n"
+      "fmla v27.4s, v28.4s, v4.s[0]\n"
+      "ldr q28, [x11, #0x10]\n"
+      "fmla v8.4s, v29.4s, v0.s[1]\n"
+      "fmla v12.4s, v29.4s, v1.s[1]\n"
+      "fmla v16.4s, v29.4s, v2.s[1]\n"
+      "fmla v20.4s, v29.4s, v3.s[1]\n"
+      "fmla v24.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "fmla v9.4s, v28.4s, v0.s[1]\n"
+      "fmla v13.4s, v28.4s, v1.s[1]\n"
+      "fmla v17.4s, v28.4s, v2.s[1]\n"
+      "fmla v21.4s, v28.4s, v3.s[1]\n"
+      "fmla v25.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x9, #0x10]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v14.4s, v29.4s, v1.s[1]\n"
+      "fmla v18.4s, v29.4s, v2.s[1]\n"
+      "fmla v22.4s, v29.4s, v3.s[1]\n"
+      "fmla v26.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x12, #0x20]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v15.4s, v28.4s, v1.s[1]\n"
+      "fmla v19.4s, v28.4s, v2.s[1]\n"
+      "fmla v23.4s, v28.4s, v3.s[1]\n"
+      "fmla v27.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x11, #0x20]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v12.4s, v29.4s, v1.s[2]\n"
+      "fmla v16.4s, v29.4s, v2.s[2]\n"
+      "fmla v20.4s, v29.4s, v3.s[2]\n"
+      "fmla v24.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v13.4s, v28.4s, v1.s[2]\n"
+      "fmla v17.4s, v28.4s, v2.s[2]\n"
+      "fmla v21.4s, v28.4s, v3.s[2]\n"
+      "fmla v25.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x9, #0x20]\n"
+      "fmla v10.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v18.4s, v29.4s, v2.s[2]\n"
+      "fmla v22.4s, v29.4s, v3.s[2]\n"
+      "fmla v26.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x12, #0x30]\n"
+      "fmla v11.4s, v28.4s, v0.s[2]\n"
       "add x12, x12, #0x40\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "fmla v27.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x11, #0x30]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v19.4s, v28.4s, v2.s[2]\n"
+      "fmla v23.4s, v28.4s, v3.s[2]\n"
+      "fmla v27.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x11, #0x30]\n"
       "add x11, x11, #0x40\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "fmla v24.4s, v6.4s, v4.s[3]\n"
-      "ldr q6, [x10, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v8.4s, v29.4s, v0.s[3]\n"
+      "fmla v12.4s, v29.4s, v1.s[3]\n"
+      "fmla v16.4s, v29.4s, v2.s[3]\n"
+      "fmla v20.4s, v29.4s, v3.s[3]\n"
+      "fmla v24.4s, v29.4s, v4.s[3]\n"
+      "ldr q29, [x10, #0x30]\n"
+      "fmla v9.4s, v28.4s, v0.s[3]\n"
       "add x10, x10, #0x40\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "fmla v25.4s, v7.4s, v4.s[3]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "fmla v13.4s, v28.4s, v1.s[3]\n"
+      "fmla v17.4s, v28.4s, v2.s[3]\n"
+      "fmla v21.4s, v28.4s, v3.s[3]\n"
+      "fmla v25.4s, v28.4s, v4.s[3]\n"
+      "ldr q28, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v26.4s, v6.4s, v4.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
-      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "fmla v14.4s, v29.4s, v1.s[3]\n"
+      "fmla v18.4s, v29.4s, v2.s[3]\n"
+      "fmla v22.4s, v29.4s, v3.s[3]\n"
+      "fmla v26.4s, v29.4s, v4.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "fmla v15.4s, v28.4s, v1.s[3]\n"
+      "fmla v19.4s, v28.4s, v2.s[3]\n"
+      "fmla v23.4s, v28.4s, v3.s[3]\n"
+      "fmla v27.4s, v28.4s, v4.s[3]\n"
       "157:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 159f\n"
       "158:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
       "ldr s1, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "ldr s0, [x24], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q29, [x12, #0x0]\n"
+      "fmla v8.4s, v29.4s, v2.s[0]\n"
+      "fmla v12.4s, v29.4s, v1.s[0]\n"
+      "ldr q28, [x11, #0x0]\n"
+      "fmla v16.4s, v29.4s, v0.s[0]\n"
+      "fmla v20.4s, v29.4s, v31.s[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "fmla v24.4s, v29.4s, v30.s[0]\n"
+      "ldr q29, [x10, #0x0]\n"
+      "fmla v9.4s, v28.4s, v2.s[0]\n"
       "add x11, x11, #0x10\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "fmla v13.4s, v28.4s, v1.s[0]\n"
+      "fmla v17.4s, v28.4s, v0.s[0]\n"
       "add x10, x10, #0x10\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "fmla v21.4s, v28.4s, v31.s[0]\n"
+      "fmla v25.4s, v28.4s, v30.s[0]\n"
+      "ldr q28, [x9, #0x0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "fmla v10.4s, v29.4s, v2.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "fmla v18.4s, v29.4s, v0.s[0]\n"
+      "fmla v22.4s, v29.4s, v31.s[0]\n"
+      "fmla v26.4s, v29.4s, v30.s[0]\n"
+      "fmla v11.4s, v28.4s, v2.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v0.s[0]\n"
+      "fmla v23.4s, v28.4s, v31.s[0]\n"
+      "fmla v27.4s, v28.4s, v30.s[0]\n"
       "cbnz x27, 158b\n"
       "159:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2444,49 +2444,49 @@
       "add x22, x23, x20, LSL #2\n"
       "tbz %x[flags], #1, 160f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmin v24.4s, v24.4s, v1.4s\n"
-      "fmin v25.4s, v25.4s, v1.4s\n"
-      "fmin v26.4s, v26.4s, v1.4s\n"
-      "fmin v27.4s, v27.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
-      "fmax v24.4s, v24.4s, v0.4s\n"
-      "fmax v25.4s, v25.4s, v0.4s\n"
-      "fmax v26.4s, v26.4s, v0.4s\n"
-      "fmax v27.4s, v27.4s, v0.4s\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v29.4s\n"
+      "fmin v9.4s, v9.4s, v29.4s\n"
+      "fmin v10.4s, v10.4s, v29.4s\n"
+      "fmin v11.4s, v11.4s, v29.4s\n"
+      "fmin v12.4s, v12.4s, v29.4s\n"
+      "fmin v13.4s, v13.4s, v29.4s\n"
+      "fmin v14.4s, v14.4s, v29.4s\n"
+      "fmin v15.4s, v15.4s, v29.4s\n"
+      "fmin v16.4s, v16.4s, v29.4s\n"
+      "fmin v17.4s, v17.4s, v29.4s\n"
+      "fmin v18.4s, v18.4s, v29.4s\n"
+      "fmin v19.4s, v19.4s, v29.4s\n"
+      "fmin v20.4s, v20.4s, v29.4s\n"
+      "fmin v21.4s, v21.4s, v29.4s\n"
+      "fmin v22.4s, v22.4s, v29.4s\n"
+      "fmin v23.4s, v23.4s, v29.4s\n"
+      "fmin v24.4s, v24.4s, v29.4s\n"
+      "fmin v25.4s, v25.4s, v29.4s\n"
+      "fmin v26.4s, v26.4s, v29.4s\n"
+      "fmin v27.4s, v27.4s, v29.4s\n"
+      "fmax v8.4s, v8.4s, v28.4s\n"
+      "fmax v9.4s, v9.4s, v28.4s\n"
+      "fmax v10.4s, v10.4s, v28.4s\n"
+      "fmax v11.4s, v11.4s, v28.4s\n"
+      "fmax v12.4s, v12.4s, v28.4s\n"
+      "fmax v13.4s, v13.4s, v28.4s\n"
+      "fmax v14.4s, v14.4s, v28.4s\n"
+      "fmax v15.4s, v15.4s, v28.4s\n"
+      "fmax v16.4s, v16.4s, v28.4s\n"
+      "fmax v17.4s, v17.4s, v28.4s\n"
+      "fmax v18.4s, v18.4s, v28.4s\n"
+      "fmax v19.4s, v19.4s, v28.4s\n"
+      "fmax v20.4s, v20.4s, v28.4s\n"
+      "fmax v21.4s, v21.4s, v28.4s\n"
+      "fmax v22.4s, v22.4s, v28.4s\n"
+      "fmax v23.4s, v23.4s, v28.4s\n"
+      "fmax v24.4s, v24.4s, v28.4s\n"
+      "fmax v25.4s, v25.4s, v28.4s\n"
+      "fmax v26.4s, v26.4s, v28.4s\n"
+      "fmax v27.4s, v27.4s, v28.4s\n"
       "160:"  // Height 5: No activation
       "cmp x14, #0x10\n"
       "bge 169f\n"
@@ -2881,16 +2881,16 @@
       "186:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 187f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 188f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -2902,11 +2902,11 @@
       "b 188f\n"
       "187:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "188:"  // Height 6: input setup done
       "cmp x27, #0x4\n"
       "blt 191f\n"
@@ -3177,45 +3177,45 @@
       "191:"  // Height 6: Multiply loop: Main loop skip
       "cbz x27, 193f\n"
       "192:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s7, [x26], #0x4\n"
+      "ldr s6, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x12, #0x0]\n"
-      "ldr q7, [x11, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s4, [x23], #0x4\n"
+      "ldr s3, [x22], #0x4\n"
+      "ldr s2, [x21], #0x4\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q0, [x11, #0x0]\n"
+      "fmla v8.4s, v1.4s, v7.s[0]\n"
+      "fmla v12.4s, v1.4s, v6.s[0]\n"
+      "fmla v16.4s, v1.4s, v5.s[0]\n"
+      "fmla v20.4s, v1.4s, v4.s[0]\n"
       "add x12, x12, #0x10\n"
       "add x11, x11, #0x10\n"
-      "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "fmla v28.4s, v6.4s, v5.s[0]\n"
-      "ldr q6, [x10, #0x0]\n"
+      "fmla v24.4s, v1.4s, v3.s[0]\n"
+      "fmla v28.4s, v1.4s, v2.s[0]\n"
+      "ldr q1, [x10, #0x0]\n"
       "add x10, x10, #0x10\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "fmla v29.4s, v7.4s, v5.s[0]\n"
-      "ldr q7, [x9, #0x0]\n"
+      "fmla v9.4s, v0.4s, v7.s[0]\n"
+      "fmla v13.4s, v0.4s, v6.s[0]\n"
+      "fmla v17.4s, v0.4s, v5.s[0]\n"
+      "fmla v21.4s, v0.4s, v4.s[0]\n"
+      "fmla v25.4s, v0.4s, v3.s[0]\n"
+      "fmla v29.4s, v0.4s, v2.s[0]\n"
+      "ldr q0, [x9, #0x0]\n"
       "add x9, x9, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "fmla v30.4s, v6.4s, v5.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "fmla v31.4s, v7.4s, v5.s[0]\n"
+      "fmla v10.4s, v1.4s, v7.s[0]\n"
+      "fmla v14.4s, v1.4s, v6.s[0]\n"
+      "fmla v18.4s, v1.4s, v5.s[0]\n"
+      "fmla v22.4s, v1.4s, v4.s[0]\n"
+      "fmla v26.4s, v1.4s, v3.s[0]\n"
+      "fmla v30.4s, v1.4s, v2.s[0]\n"
+      "fmla v11.4s, v0.4s, v7.s[0]\n"
+      "fmla v15.4s, v0.4s, v6.s[0]\n"
+      "fmla v19.4s, v0.4s, v5.s[0]\n"
+      "fmla v23.4s, v0.4s, v4.s[0]\n"
+      "fmla v27.4s, v0.4s, v3.s[0]\n"
+      "fmla v31.4s, v0.4s, v2.s[0]\n"
       "cbnz x27, 192b\n"
       "193:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
index af2c1e5..923d008 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
index 1f707fa..8961e61 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -283,11 +283,11 @@
       "21:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 22f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -307,32 +307,32 @@
       "24:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x0]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      "ldr q23, [x10, #0x10]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
+      "ldr q22, [x9, #0x0]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      "ldr q21, [x9, #0x10]\n"
+      ".inst 0x6e58ec0a  // bfmmla v10.4s, v0.8h, v24.8h\n"
+      "ldr q24, [x28, #0x0]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x10]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      "ldr q22, [x27, #0x0]\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x27, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x8\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e58ec0c  // bfmmla v12.4s, v0.8h, v24.8h\n"
       "add x12, x12, #0x20\n"
       "ldr q4, [x12, #0x0]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
       "ldr q5, [x12, #0x10]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
       "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "ld1 { v0.4s }, [x24], #0x10\n"
       "ldr q7, [x11, #0x10]\n"
       "add x10, x10, #0x20\n"
@@ -343,28 +343,28 @@
       "25:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      "ldr q22, [x10, #0x0]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      "ldr q25, [x10, #0x10]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
+      "ldr q21, [x9, #0x0]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      "ldr q24, [x9, #0x10]\n"
+      ".inst 0x6e56ec0a  // bfmmla v10.4s, v0.8h, v22.8h\n"
+      "ldr q23, [x28, #0x0]\n"
+      ".inst 0x6e59ec10  // bfmmla v16.4s, v0.8h, v25.8h\n"
+      "ldr q22, [x28, #0x10]\n"
+      ".inst 0x6e55ec0b  // bfmmla v11.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x27, #0x0]\n"
+      ".inst 0x6e58ec11  // bfmmla v17.4s, v0.8h, v24.8h\n"
+      "ldr q3, [x27, #0x10]\n"
       "sub x25, x25, #0x4\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e57ec0c  // bfmmla v12.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e56ec12  // bfmmla v18.4s, v0.8h, v22.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec0d  // bfmmla v13.4s, v0.8h, v21.8h\n"
+      ".inst 0x6e43ec13  // bfmmla v19.4s, v0.8h, v3.8h\n"
       "add x10, x10, #0x20\n"
       "add x9, x9, #0x20\n"
       "add x28, x28, #0x20\n"
@@ -380,31 +380,31 @@
       "27:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr s0, [x24, #0x0]\n"
       "28:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x12, #0x0]\n"
-      "ldr q5, [x12, #0x10]\n"
+      "ldr q21, [x12, #0x0]\n"
+      "ldr q30, [x12, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q6, [x11, #0x0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q4, [x10, #0x0]\n"
-      "ldr q5, [x10, #0x10]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      "ldr q7, [x27, #0x10]\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec08  // bfmmla v8.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x11, #0x0]\n"
+      "ldr q22, [x11, #0x10]\n"
+      ".inst 0x6e5eec0e  // bfmmla v14.4s, v0.8h, v30.8h\n"
+      ".inst 0x6e55ec09  // bfmmla v9.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x10, #0x0]\n"
+      "ldr q23, [x10, #0x10]\n"
+      ".inst 0x6e56ec0f  // bfmmla v15.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec0a  // bfmmla v10.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x9, #0x0]\n"
+      "ldr q22, [x9, #0x10]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e55ec0b  // bfmmla v11.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0x0]\n"
+      "ldr q23, [x28, #0x10]\n"
+      ".inst 0x6e56ec11  // bfmmla v17.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec0c  // bfmmla v12.4s, v0.8h, v21.8h\n"
+      "ldr q22, [x27, #0x0]\n"
+      "ldr q21, [x27, #0x10]\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
       "add x10, x10, #0x20\n"
@@ -424,21 +424,21 @@
       "uzp1 v13.2d, v13.2d, v19.2d\n"
       "tbz %x[flags], #1, 30f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v21.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v22.4s\n"
+      "fmin v9.4s, v9.4s, v22.4s\n"
+      "fmin v10.4s, v10.4s, v22.4s\n"
+      "fmin v11.4s, v11.4s, v22.4s\n"
+      "fmin v12.4s, v12.4s, v22.4s\n"
+      "fmin v13.4s, v13.4s, v22.4s\n"
+      "fmax v8.4s, v8.4s, v21.4s\n"
+      "fmax v9.4s, v9.4s, v21.4s\n"
+      "fmax v10.4s, v10.4s, v21.4s\n"
+      "fmax v11.4s, v11.4s, v21.4s\n"
+      "fmax v12.4s, v12.4s, v21.4s\n"
+      "fmax v13.4s, v13.4s, v21.4s\n"
       "30:"  // Height 1: No activation
       "cmp x14, #0x18\n"
       "bge 43f\n"
@@ -744,12 +744,12 @@
       "65:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 66f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 67f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -757,7 +757,7 @@
       "b 67f\n"
       "66:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "67:"  // Height 2: input setup done
       "cmp x25, #0x4\n"
       "blt 70f\n"
@@ -774,32 +774,32 @@
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       "ld1 { v1.4s }, [x23], #0x10\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      "ldr q30, [x10, #0x0]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      "ldr q23, [x10, #0x10]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
+      "ldr q22, [x9, #0x0]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      "ldr q21, [x9, #0x10]\n"
+      ".inst 0x6e5eec0a  // bfmmla v10.4s, v0.8h, v30.8h\n"
+      "ldr q2, [x28, #0x0]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x10]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      "ldr q22, [x27, #0x0]\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x27, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x8\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e42ec0c  // bfmmla v12.4s, v0.8h, v2.8h\n"
       "ldr q4, [x12, #0x0]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
       "ldr q5, [x12, #0x10]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
       "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "ld1 { v0.4s }, [x24], #0x10\n"
       "add x10, x10, #0x20\n"
       "ldr q7, [x11, #0x10]\n"
@@ -811,28 +811,28 @@
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x0]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      "ldr q23, [x10, #0x10]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
+      "ldr q22, [x9, #0x0]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      "ldr q21, [x9, #0x10]\n"
+      ".inst 0x6e58ec0a  // bfmmla v10.4s, v0.8h, v24.8h\n"
+      "ldr q24, [x28, #0x0]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x10]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      "ldr q22, [x27, #0x0]\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x27, #0x10]\n"
       "sub x25, x25, #0x4\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e58ec0c  // bfmmla v12.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "add x10, x10, #0x20\n"
       "add x9, x9, #0x20\n"
       "add x28, x28, #0x20\n"
@@ -851,32 +851,32 @@
       "ldr s0, [x24, #0x0]\n"
       "ldr s1, [x23, #0x0]\n"
       "72:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x12, #0x0]\n"
-      "ldr q5, [x12, #0x10]\n"
+      "ldr q24, [x12, #0x0]\n"
+      "ldr q23, [x12, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
-      "ldr q6, [x11, #0x0]\n"
-      "ldr q7, [x11, #0x10]\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q4, [x10, #0x0]\n"
-      "ldr q5, [x10, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      "ldr q7, [x27, #0x10]\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      "ldr q22, [x11, #0x0]\n"
+      "ldr q21, [x11, #0x10]\n"
+      ".inst 0x6e58ec08  // bfmmla v8.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec0e  // bfmmla v14.4s, v0.8h, v23.8h\n"
+      "ldr q24, [x10, #0x0]\n"
+      "ldr q23, [x10, #0x10]\n"
+      ".inst 0x6e56ec09  // bfmmla v9.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec0f  // bfmmla v15.4s, v0.8h, v21.8h\n"
+      "ldr q22, [x9, #0x0]\n"
+      "ldr q21, [x9, #0x10]\n"
+      ".inst 0x6e58ec0a  // bfmmla v10.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q23, [x28, #0x10]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q22, [x27, #0x0]\n"
+      "ldr q21, [x27, #0x10]\n"
+      ".inst 0x6e58ec0c  // bfmmla v12.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "add x12, x12, #0x20\n"
       "add x11, x11, #0x20\n"
       "add x10, x10, #0x20\n"
@@ -904,33 +904,33 @@
       "uzp2 v13.2d, v13.2d, v19.2d\n"
       "tbz %x[flags], #1, 74f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v4.4s, v4.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmax v4.4s, v4.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v21.4s }, [x20]\n"
+      "fmin v4.4s, v4.4s, v22.4s\n"
+      "fmin v14.4s, v14.4s, v22.4s\n"
+      "fmin v15.4s, v15.4s, v22.4s\n"
+      "fmin v16.4s, v16.4s, v22.4s\n"
+      "fmin v17.4s, v17.4s, v22.4s\n"
+      "fmin v18.4s, v18.4s, v22.4s\n"
+      "fmin v8.4s, v8.4s, v22.4s\n"
+      "fmin v9.4s, v9.4s, v22.4s\n"
+      "fmin v10.4s, v10.4s, v22.4s\n"
+      "fmin v11.4s, v11.4s, v22.4s\n"
+      "fmin v12.4s, v12.4s, v22.4s\n"
+      "fmin v13.4s, v13.4s, v22.4s\n"
+      "fmax v4.4s, v4.4s, v21.4s\n"
+      "fmax v14.4s, v14.4s, v21.4s\n"
+      "fmax v15.4s, v15.4s, v21.4s\n"
+      "fmax v16.4s, v16.4s, v21.4s\n"
+      "fmax v17.4s, v17.4s, v21.4s\n"
+      "fmax v18.4s, v18.4s, v21.4s\n"
+      "fmax v8.4s, v8.4s, v21.4s\n"
+      "fmax v9.4s, v9.4s, v21.4s\n"
+      "fmax v10.4s, v10.4s, v21.4s\n"
+      "fmax v11.4s, v11.4s, v21.4s\n"
+      "fmax v12.4s, v12.4s, v21.4s\n"
+      "fmax v13.4s, v13.4s, v21.4s\n"
       "74:"  // Height 2: No activation
       "cmp x14, #0x18\n"
       "bge 87f\n"
@@ -1339,13 +1339,13 @@
       "109:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 110f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 111f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -1354,8 +1354,8 @@
       "b 111f\n"
       "110:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "111:"  // Height 3: input setup done
       "cmp x25, #0x4\n"
       "blt 114f\n"
@@ -1386,7 +1386,7 @@
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
       "cmp x25, #0x8\n"
       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
+      "ldr q3, [x9, #0x10]\n"
       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
       "add x12, x12, #0x20\n"
       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
@@ -1399,10 +1399,10 @@
       "add x10, x10, #0x20\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
       "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec11  // bfmmla v17.4s, v0.8h, v3.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      ".inst 0x6e43ec5d  // bfmmla v29.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x27, #0x10]\n"
       "add x28, x28, #0x20\n"
       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
       "add x27, x27, #0x20\n"
@@ -1414,9 +1414,9 @@
       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
       "ldr q6, [x11, #0x0]\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec13  // bfmmla v19.4s, v0.8h, v3.8h\n"
       "ld1 { v0.4s }, [x24], #0x10\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec5f  // bfmmla v31.4s, v2.8h, v3.8h\n"
       "ld1 { v2.4s }, [x22], #0x10\n"
       "ldr q7, [x11, #0x10]\n"
       "bge 112b\n"
@@ -1427,10 +1427,10 @@
       "sub x25, x25, #0x4\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      "ldr q4, [x10, #0x10]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
       "add x12, x12, #0x20\n"
       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
@@ -1438,31 +1438,31 @@
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
       "add x11, x11, #0x20\n"
       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
+      "ldr q1, [x9, #0x10]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e43ec56  // bfmmla v22.4s, v2.8h, v3.8h\n"
+      "ldr q5, [x28, #0x0]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x10]\n"
       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
       "add x28, x28, #0x20\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      "ldr q3, [x27, #0x0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x27, #0x10]\n"
       "add x27, x27, #0x20\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "114:"  // Height 3: Multiply loop: Main loop skip
       "cbz x25, 117f\n"
       "cbz x25, 117f\n"
@@ -1480,51 +1480,51 @@
       "ldr s1, [x23, #0x0]\n"
       "ldr s2, [x22, #0x0]\n"
       "116:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x12, #0x0]\n"
-      "ldr q5, [x12, #0x10]\n"
+      "ldr q5, [x12, #0x0]\n"
+      "ldr q4, [x12, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
-      "ldr q6, [x11, #0x0]\n"
-      "ldr q7, [x11, #0x10]\n"
+      "ldr q3, [x11, #0x0]\n"
+      "ldr q1, [x11, #0x10]\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec08  // bfmmla v8.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec54  // bfmmla v20.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x10, #0x0]\n"
+      ".inst 0x6e44ec0e  // bfmmla v14.4s, v0.8h, v4.8h\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e44ec5a  // bfmmla v26.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x10, #0x10]\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec55  // bfmmla v21.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x9, #0x0]\n"
+      ".inst 0x6e41ec0f  // bfmmla v15.4s, v0.8h, v1.8h\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e41ec5b  // bfmmla v27.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x9, #0x10]\n"
+      ".inst 0x6e45ec0a  // bfmmla v10.4s, v0.8h, v5.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e45ec56  // bfmmla v22.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x28, #0x0]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x10]\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
       "add x28, x28, #0x20\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x27, #0x0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x27, #0x10]\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
       "add x27, x27, #0x20\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "117:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -2070,14 +2070,14 @@
       "153:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 154f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 155f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -2087,9 +2087,9 @@
       "b 155f\n"
       "154:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "155:"  // Height 4: input setup done
       "cmp x25, #0x4\n"
       "blt 158f\n"
@@ -2167,40 +2167,40 @@
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
       "add x11, x11, #0x20\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      "ldr q4, [x10, #0x10]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
       "add x10, x10, #0x20\n"
       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
       "ldr q6, [x9, #0x0]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
+      "ldr q1, [x9, #0x10]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
+      ".inst 0x6e43ec56  // bfmmla v22.4s, v2.8h, v3.8h\n"
+      "ldr q5, [x28, #0x0]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x10]\n"
       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
       "add x28, x28, #0x20\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      "ldr q3, [x27, #0x0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x27, #0x10]\n"
       "add x27, x27, #0x20\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "158:"  // Height 4: Multiply loop: Main loop skip
       "cbz x25, 161f\n"
       "cbz x25, 161f\n"
@@ -2221,52 +2221,52 @@
       "ldr s2, [x22, #0x0]\n"
       "ldr s3, [x21, #0x0]\n"
       "160:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x12, #0x0]\n"
-      "ldr q5, [x12, #0x10]\n"
+      "ldr q5, [x12, #0x0]\n"
+      "ldr q4, [x12, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
-      "ldr q6, [x11, #0x0]\n"
-      "ldr q7, [x11, #0x10]\n"
+      "ldr q7, [x11, #0x0]\n"
+      "ldr q6, [x11, #0x10]\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x10, #0x0]\n"
+      ".inst 0x6e45ec08  // bfmmla v8.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec54  // bfmmla v20.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x10, #0x0]\n"
       "add x12, x12, #0x20\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x10, #0x10]\n"
+      ".inst 0x6e44ec0e  // bfmmla v14.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5a  // bfmmla v26.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x10, #0x10]\n"
       "add x11, x11, #0x20\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x9, #0x0]\n"
+      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
+      "ldr q3, [x9, #0x0]\n"
       "add x10, x10, #0x20\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x9, #0x10]\n"
+      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e46ec5b  // bfmmla v27.4s, v2.8h, v6.8h\n"
+      "ldr q1, [x9, #0x10]\n"
       "add x9, x9, #0x20\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x10]\n"
+      ".inst 0x6e45ec0a  // bfmmla v10.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec56  // bfmmla v22.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x28, #0x0]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x10]\n"
       "add x28, x28, #0x20\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x27, #0x0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x27, #0x10]\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x27, #0x0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x27, #0x10]\n"
       "add x27, x27, #0x20\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "161:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
index e24dab6..745f89e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
index 2458d6a..5f4fcac 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp
@@ -52,29 +52,29 @@
 
     __asm__ __volatile__(
       "1:"  // Height loop
-      "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x23, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x24, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
-      "add x22, x25, x20, LSL #1\n"
+      "add x22, x23, x20, LSL #1\n"
       "add x21, x22, x20, LSL #1\n"
       "add x20, x21, x20, LSL #1\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "cmp x24, #0x8\n"
-      "mov %x[Apanel], x23\n"
+      "cmp x25, #0x8\n"
+      "mov %x[Apanel], x24\n"
       "bgt 3f\n"
-      "cmp x24, #0x4\n"
-      "mov x21, x25\n"
+      "cmp x25, #0x4\n"
+      "mov x21, x23\n"
       "bgt 3f\n"
-      "mov x22, x25\n"
+      "mov x22, x23\n"
       "3:"  // B setup done
       "ldr q0, [%x[Apanel], #0x0]\n"
       "ldr q1, [%x[Apanel], #0x10]\n"
       "movi v8.16b, #0x0\n"
-      "ldr q4, [x25, #0x0]\n"
+      "ldr q4, [x23, #0x0]\n"
       "ldr q5, [x22, #0x0]\n"
       "movi v9.16b, #0x0\n"
       "ldr q6, [x21, #0x0]\n"
@@ -104,8 +104,8 @@
       "movi v31.16b, #0x0\n"
       "blt 5f\n"
       "4:"  // main loop head
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
+      "ldr q3, [%x[Apanel], #0x20]\n"
+      "ldr q7, [%x[Apanel], #0x30]\n"
       ".inst 0x4f40f088  // bfdot v8.4s, v4.8h, v0.h[0]\n"
       ".inst 0x4f60f08b  // bfdot v11.4s, v4.8h, v0.h[1]\n"
       ".inst 0x4f40f88e  // bfdot v14.4s, v4.8h, v0.h[2]\n"
@@ -117,11 +117,11 @@
       ".inst 0x4f41f89a  // bfdot v26.4s, v4.8h, v1.h[2]\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
       ".inst 0x4f61f89d  // bfdot v29.4s, v4.8h, v1.h[3]\n"
-      "ldr q4, [x25, #0x10]\n"
+      "ldr q4, [x23, #0x10]\n"
       ".inst 0x4f40f0a9  // bfdot v9.4s, v5.8h, v0.h[0]\n"
       ".inst 0x4f60f0ac  // bfdot v12.4s, v5.8h, v0.h[1]\n"
       ".inst 0x4f40f8af  // bfdot v15.4s, v5.8h, v0.h[2]\n"
-      "add x25, x25, #0x20\n"
+      "add x23, x23, #0x20\n"
       ".inst 0x4f60f8b2  // bfdot v18.4s, v5.8h, v0.h[3]\n"
       ".inst 0x4f41f0b5  // bfdot v21.4s, v5.8h, v1.h[0]\n"
       ".inst 0x4f61f0b8  // bfdot v24.4s, v5.8h, v1.h[1]\n"
@@ -138,35 +138,35 @@
       ".inst 0x4f61f0d9  // bfdot v25.4s, v6.8h, v1.h[1]\n"
       ".inst 0x4f41f8dc  // bfdot v28.4s, v6.8h, v1.h[2]\n"
       ".inst 0x4f61f8df  // bfdot v31.4s, v6.8h, v1.h[3]\n"
-      "ldr q6, [x21, #0x10]\n"
+      "ldr q2, [x21, #0x10]\n"
       "ldr q1, [%x[Apanel], #0x10]\n"
       "add x21, x21, #0x20\n"
-      ".inst 0x4f42f088  // bfdot v8.4s, v4.8h, v2.h[0]\n"
-      ".inst 0x4f62f08b  // bfdot v11.4s, v4.8h, v2.h[1]\n"
-      ".inst 0x4f42f88e  // bfdot v14.4s, v4.8h, v2.h[2]\n"
-      ".inst 0x4f62f891  // bfdot v17.4s, v4.8h, v2.h[3]\n"
-      ".inst 0x4f43f094  // bfdot v20.4s, v4.8h, v3.h[0]\n"
-      ".inst 0x4f63f097  // bfdot v23.4s, v4.8h, v3.h[1]\n"
-      ".inst 0x4f43f89a  // bfdot v26.4s, v4.8h, v3.h[2]\n"
-      ".inst 0x4f63f89d  // bfdot v29.4s, v4.8h, v3.h[3]\n"
-      "ldr q4, [x25, #0x0]\n"
-      ".inst 0x4f42f0a9  // bfdot v9.4s, v5.8h, v2.h[0]\n"
-      ".inst 0x4f62f0ac  // bfdot v12.4s, v5.8h, v2.h[1]\n"
-      ".inst 0x4f42f8af  // bfdot v15.4s, v5.8h, v2.h[2]\n"
-      ".inst 0x4f62f8b2  // bfdot v18.4s, v5.8h, v2.h[3]\n"
-      ".inst 0x4f43f0b5  // bfdot v21.4s, v5.8h, v3.h[0]\n"
-      ".inst 0x4f63f0b8  // bfdot v24.4s, v5.8h, v3.h[1]\n"
-      ".inst 0x4f43f8bb  // bfdot v27.4s, v5.8h, v3.h[2]\n"
-      ".inst 0x4f63f8be  // bfdot v30.4s, v5.8h, v3.h[3]\n"
+      ".inst 0x4f43f088  // bfdot v8.4s, v4.8h, v3.h[0]\n"
+      ".inst 0x4f63f08b  // bfdot v11.4s, v4.8h, v3.h[1]\n"
+      ".inst 0x4f43f88e  // bfdot v14.4s, v4.8h, v3.h[2]\n"
+      ".inst 0x4f63f891  // bfdot v17.4s, v4.8h, v3.h[3]\n"
+      ".inst 0x4f47f094  // bfdot v20.4s, v4.8h, v7.h[0]\n"
+      ".inst 0x4f67f097  // bfdot v23.4s, v4.8h, v7.h[1]\n"
+      ".inst 0x4f47f89a  // bfdot v26.4s, v4.8h, v7.h[2]\n"
+      ".inst 0x4f67f89d  // bfdot v29.4s, v4.8h, v7.h[3]\n"
+      "ldr q4, [x23, #0x0]\n"
+      ".inst 0x4f43f0a9  // bfdot v9.4s, v5.8h, v3.h[0]\n"
+      ".inst 0x4f63f0ac  // bfdot v12.4s, v5.8h, v3.h[1]\n"
+      ".inst 0x4f43f8af  // bfdot v15.4s, v5.8h, v3.h[2]\n"
+      ".inst 0x4f63f8b2  // bfdot v18.4s, v5.8h, v3.h[3]\n"
+      ".inst 0x4f47f0b5  // bfdot v21.4s, v5.8h, v7.h[0]\n"
+      ".inst 0x4f67f0b8  // bfdot v24.4s, v5.8h, v7.h[1]\n"
+      ".inst 0x4f47f8bb  // bfdot v27.4s, v5.8h, v7.h[2]\n"
+      ".inst 0x4f67f8be  // bfdot v30.4s, v5.8h, v7.h[3]\n"
       "ldr q5, [x22, #0x0]\n"
-      ".inst 0x4f42f0ca  // bfdot v10.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f62f0cd  // bfdot v13.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f62f8d3  // bfdot v19.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f63f0d9  // bfdot v25.4s, v6.8h, v3.h[1]\n"
-      ".inst 0x4f43f8dc  // bfdot v28.4s, v6.8h, v3.h[2]\n"
-      ".inst 0x4f63f8df  // bfdot v31.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f43f04a  // bfdot v10.4s, v2.8h, v3.h[0]\n"
+      ".inst 0x4f63f04d  // bfdot v13.4s, v2.8h, v3.h[1]\n"
+      ".inst 0x4f43f850  // bfdot v16.4s, v2.8h, v3.h[2]\n"
+      ".inst 0x4f63f853  // bfdot v19.4s, v2.8h, v3.h[3]\n"
+      ".inst 0x4f47f056  // bfdot v22.4s, v2.8h, v7.h[0]\n"
+      ".inst 0x4f67f059  // bfdot v25.4s, v2.8h, v7.h[1]\n"
+      ".inst 0x4f47f85c  // bfdot v28.4s, v2.8h, v7.h[2]\n"
+      ".inst 0x4f67f85f  // bfdot v31.4s, v2.8h, v7.h[3]\n"
       "ldr q6, [x21, #0x0]\n"
       "bge 4b\n"
       "5:"  // main loop skip
@@ -175,7 +175,7 @@
       "add %x[Apanel], %x[Apanel], #0x20\n"
       ".inst 0x4f40f88e  // bfdot v14.4s, v4.8h, v0.h[2]\n"
       ".inst 0x4f60f891  // bfdot v17.4s, v4.8h, v0.h[3]\n"
-      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
       ".inst 0x4f41f094  // bfdot v20.4s, v4.8h, v1.h[0]\n"
       ".inst 0x4f61f097  // bfdot v23.4s, v4.8h, v1.h[1]\n"
       "add x22, x22, #0x10\n"
@@ -199,38 +199,38 @@
       ".inst 0x4f41f8dc  // bfdot v28.4s, v6.8h, v1.h[2]\n"
       ".inst 0x4f61f8df  // bfdot v31.4s, v6.8h, v1.h[3]\n"
       "cbz x20, 6f\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
+      "ldr q4, [%x[Apanel], #0x0]\n"
+      "ldr q3, [%x[Apanel], #0x10]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ldr q7, [x25, #0x0]\n"
-      "ldr q4, [x22, #0x0]\n"
-      ".inst 0x4f40f0e8  // bfdot v8.4s, v7.8h, v0.h[0]\n"
-      "ldr q5, [x21, #0x0]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f40f8ee  // bfdot v14.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f60f8f1  // bfdot v17.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f41f0f4  // bfdot v20.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f61f0f7  // bfdot v23.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f41f8fa  // bfdot v26.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f61f8fd  // bfdot v29.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f40f089  // bfdot v9.4s, v4.8h, v0.h[0]\n"
-      ".inst 0x4f60f08c  // bfdot v12.4s, v4.8h, v0.h[1]\n"
-      ".inst 0x4f40f88f  // bfdot v15.4s, v4.8h, v0.h[2]\n"
-      ".inst 0x4f60f892  // bfdot v18.4s, v4.8h, v0.h[3]\n"
-      ".inst 0x4f41f095  // bfdot v21.4s, v4.8h, v1.h[0]\n"
-      ".inst 0x4f61f098  // bfdot v24.4s, v4.8h, v1.h[1]\n"
-      ".inst 0x4f41f89b  // bfdot v27.4s, v4.8h, v1.h[2]\n"
-      ".inst 0x4f61f89e  // bfdot v30.4s, v4.8h, v1.h[3]\n"
-      ".inst 0x4f40f0aa  // bfdot v10.4s, v5.8h, v0.h[0]\n"
-      ".inst 0x4f60f0ad  // bfdot v13.4s, v5.8h, v0.h[1]\n"
-      ".inst 0x4f40f8b0  // bfdot v16.4s, v5.8h, v0.h[2]\n"
-      ".inst 0x4f60f8b3  // bfdot v19.4s, v5.8h, v0.h[3]\n"
-      ".inst 0x4f41f0b6  // bfdot v22.4s, v5.8h, v1.h[0]\n"
-      ".inst 0x4f61f0b9  // bfdot v25.4s, v5.8h, v1.h[1]\n"
-      ".inst 0x4f41f8bc  // bfdot v28.4s, v5.8h, v1.h[2]\n"
-      ".inst 0x4f61f8bf  // bfdot v31.4s, v5.8h, v1.h[3]\n"
+      "ldr q2, [x23, #0x0]\n"
+      "ldr q1, [x22, #0x0]\n"
+      ".inst 0x4f44f048  // bfdot v8.4s, v2.8h, v4.h[0]\n"
+      "ldr q0, [x21, #0x0]\n"
+      ".inst 0x4f64f04b  // bfdot v11.4s, v2.8h, v4.h[1]\n"
+      ".inst 0x4f44f84e  // bfdot v14.4s, v2.8h, v4.h[2]\n"
+      ".inst 0x4f64f851  // bfdot v17.4s, v2.8h, v4.h[3]\n"
+      ".inst 0x4f43f054  // bfdot v20.4s, v2.8h, v3.h[0]\n"
+      ".inst 0x4f63f057  // bfdot v23.4s, v2.8h, v3.h[1]\n"
+      ".inst 0x4f43f85a  // bfdot v26.4s, v2.8h, v3.h[2]\n"
+      ".inst 0x4f63f85d  // bfdot v29.4s, v2.8h, v3.h[3]\n"
+      ".inst 0x4f44f029  // bfdot v9.4s, v1.8h, v4.h[0]\n"
+      ".inst 0x4f64f02c  // bfdot v12.4s, v1.8h, v4.h[1]\n"
+      ".inst 0x4f44f82f  // bfdot v15.4s, v1.8h, v4.h[2]\n"
+      ".inst 0x4f64f832  // bfdot v18.4s, v1.8h, v4.h[3]\n"
+      ".inst 0x4f43f035  // bfdot v21.4s, v1.8h, v3.h[0]\n"
+      ".inst 0x4f63f038  // bfdot v24.4s, v1.8h, v3.h[1]\n"
+      ".inst 0x4f43f83b  // bfdot v27.4s, v1.8h, v3.h[2]\n"
+      ".inst 0x4f63f83e  // bfdot v30.4s, v1.8h, v3.h[3]\n"
+      ".inst 0x4f44f00a  // bfdot v10.4s, v0.8h, v4.h[0]\n"
+      ".inst 0x4f64f00d  // bfdot v13.4s, v0.8h, v4.h[1]\n"
+      ".inst 0x4f44f810  // bfdot v16.4s, v0.8h, v4.h[2]\n"
+      ".inst 0x4f64f813  // bfdot v19.4s, v0.8h, v4.h[3]\n"
+      ".inst 0x4f43f016  // bfdot v22.4s, v0.8h, v3.h[0]\n"
+      ".inst 0x4f63f019  // bfdot v25.4s, v0.8h, v3.h[1]\n"
+      ".inst 0x4f43f81c  // bfdot v28.4s, v0.8h, v3.h[2]\n"
+      ".inst 0x4f63f81f  // bfdot v31.4s, v0.8h, v3.h[3]\n"
       "6:"  // multiply loop done
-      "subs x24, x24, #0xc\n"
+      "subs x25, x25, #0xc\n"
       "str q8, [%x[Cpanel], #0x0]\n"
       "str q9, [%x[Cpanel], #0x10]\n"
       "str q10, [%x[Cpanel], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
index c61315b..cf4d742 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
index 4799111..4a1c1b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -52,37 +52,37 @@
 
     __asm__ __volatile__(
       "1:"  // Height loop
-      "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x23, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x24, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
-      "add x22, x25, x20, LSL #1\n"
+      "add x22, x23, x20, LSL #1\n"
       "add x21, x22, x20, LSL #1\n"
       "add x20, x21, x20, LSL #1\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "cmp x24, #0x8\n"
-      "mov %x[Apanel], x23\n"
+      "cmp x25, #0x8\n"
+      "mov %x[Apanel], x24\n"
       "bgt 3f\n"
-      "cmp x24, #0x4\n"
-      "mov x21, x25\n"
+      "cmp x25, #0x4\n"
+      "mov x21, x23\n"
       "bgt 3f\n"
-      "mov x22, x25\n"
+      "mov x22, x23\n"
       "3:"  // B setup done
-      "ldr q4, [x25, #0x0]\n"
+      "ldr q4, [x23, #0x0]\n"
       "ldr q0, [%x[Apanel], #0x0]\n"
       "movi v8.16b, #0x0\n"
       "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q5, [x25, #0x10]\n"
+      "ldr q5, [x23, #0x10]\n"
       "movi v9.16b, #0x0\n"
       "ldr q2, [%x[Apanel], #0x20]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
       "cmp x20, #0x2\n"
       "movi v10.16b, #0x0\n"
       "movi v11.16b, #0x0\n"
-      "add x25, x25, #0x20\n"
+      "add x23, x23, #0x20\n"
       "movi v12.16b, #0x0\n"
       "movi v13.16b, #0x0\n"
       "add %x[Apanel], %x[Apanel], #0x30\n"
@@ -106,31 +106,31 @@
       "movi v31.16b, #0x0\n"
       "blt 5f\n"
       "4:"  // main loop head
-      "ldr q3, [%x[Apanel], #0x0]\n"
-      "ldr q6, [x22, #0x0]\n"
+      "ldr q6, [%x[Apanel], #0x0]\n"
+      "ldr q7, [x22, #0x0]\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q7, [x22, #0x10]\n"
+      "ldr q3, [x22, #0x10]\n"
       ".inst 0x6e45ec0b  // bfmmla v11.4s, v0.8h, v5.8h\n"
       ".inst 0x6e44ec2e  // bfmmla v14.4s, v1.8h, v4.8h\n"
       ".inst 0x6e45ec31  // bfmmla v17.4s, v1.8h, v5.8h\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7a  // bfmmla v26.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e44ecda  // bfmmla v26.4s, v6.8h, v4.8h\n"
       "ldr q4, [x21, #0x0]\n"
-      ".inst 0x6e45ec7d  // bfmmla v29.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e45ecdd  // bfmmla v29.4s, v6.8h, v5.8h\n"
       "ldr q5, [x21, #0x10]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0c  // bfmmla v12.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e47ec2f  // bfmmla v15.4s, v1.8h, v7.8h\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6e47ec32  // bfmmla v18.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec58  // bfmmla v24.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7b  // bfmmla v27.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x25, #0x0]\n"
-      ".inst 0x6e47ec7e  // bfmmla v30.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x25, #0x10]\n"
+      ".inst 0x6e43ec32  // bfmmla v18.4s, v1.8h, v3.8h\n"
+      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec58  // bfmmla v24.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e47ecdb  // bfmmla v27.4s, v6.8h, v7.8h\n"
+      "ldr q7, [x23, #0x0]\n"
+      ".inst 0x6e43ecde  // bfmmla v30.4s, v6.8h, v3.8h\n"
+      "ldr q3, [x23, #0x10]\n"
       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
       "ldr q0, [%x[Apanel], #0x10]\n"
@@ -140,22 +140,22 @@
       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
       ".inst 0x6e45ec59  // bfmmla v25.4s, v2.8h, v5.8h\n"
       "ldr q2, [%x[Apanel], #0x30]\n"
-      ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e44ecdc  // bfmmla v28.4s, v6.8h, v4.8h\n"
       "ldr q4, [x22, #0x20]\n"
-      ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
-      "ldr q3, [%x[Apanel], #0x40]\n"
+      ".inst 0x6e45ecdf  // bfmmla v31.4s, v6.8h, v5.8h\n"
+      "ldr q6, [%x[Apanel], #0x40]\n"
       "ldr q5, [x22, #0x30]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e47ec2e  // bfmmla v14.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e43ec31  // bfmmla v17.4s, v1.8h, v3.8h\n"
       "add x22, x22, #0x40\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x21, #0x20]\n"
-      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x21, #0x30]\n"
+      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e47ecda  // bfmmla v26.4s, v6.8h, v7.8h\n"
+      "ldr q7, [x21, #0x20]\n"
+      ".inst 0x6e43ecdd  // bfmmla v29.4s, v6.8h, v3.8h\n"
+      "ldr q3, [x21, #0x30]\n"
       ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
       ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
@@ -163,23 +163,23 @@
       "add x21, x21, #0x40\n"
       ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
       ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
-      "ldr q4, [x25, #0x20]\n"
-      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
-      "ldr q5, [x25, #0x30]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e44ecdb  // bfmmla v27.4s, v6.8h, v4.8h\n"
+      "ldr q4, [x23, #0x20]\n"
+      ".inst 0x6e45ecde  // bfmmla v30.4s, v6.8h, v5.8h\n"
+      "ldr q5, [x23, #0x30]\n"
+      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
       "ldr q0, [%x[Apanel], #0x50]\n"
-      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e47ec30  // bfmmla v16.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e43ec33  // bfmmla v19.4s, v1.8h, v3.8h\n"
       "ldr q1, [%x[Apanel], #0x60]\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
       "ldr q2, [%x[Apanel], #0x70]\n"
-      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e47ecdc  // bfmmla v28.4s, v6.8h, v7.8h\n"
+      ".inst 0x6e43ecdf  // bfmmla v31.4s, v6.8h, v3.8h\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
-      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
       "bge 4b\n"
       "5:"  // main loop skip
       "ldr q3, [%x[Apanel], #0x0]\n"
@@ -215,88 +215,88 @@
       ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
       ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
       "cbz x20, 6f\n"
-      "ldr q6, [x25, #0x0]\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q7, [x25, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q4, [x22, #0x0]\n"
-      "ldr q5, [x22, #0x10]\n"
-      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      "ldr q1, [x23, #0x0]\n"
+      "ldr q7, [%x[Apanel], #0x0]\n"
+      ".inst 0x6e41ece8  // bfmmla v8.4s, v7.8h, v1.8h\n"
+      "ldr q6, [%x[Apanel], #0x10]\n"
+      "ldr q0, [x23, #0x10]\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
+      "ldr q5, [%x[Apanel], #0x20]\n"
+      "ldr q4, [%x[Apanel], #0x30]\n"
+      ".inst 0x6e41ecce  // bfmmla v14.4s, v6.8h, v1.8h\n"
+      "ldr q3, [x22, #0x0]\n"
+      "ldr q2, [x22, #0x10]\n"
+      ".inst 0x6e40ecd1  // bfmmla v17.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e41ecb4  // bfmmla v20.4s, v5.8h, v1.8h\n"
+      ".inst 0x6e40ecb7  // bfmmla v23.4s, v5.8h, v0.8h\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x21, #0x0]\n"
-      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x21, #0x10]\n"
-      ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
-      ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
-      ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
-      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e41ec9a  // bfmmla v26.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x21, #0x0]\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x21, #0x10]\n"
+      ".inst 0x6e43ece9  // bfmmla v9.4s, v7.8h, v3.8h\n"
+      ".inst 0x6e42ecec  // bfmmla v12.4s, v7.8h, v2.8h\n"
+      ".inst 0x6e43eccf  // bfmmla v15.4s, v6.8h, v3.8h\n"
+      ".inst 0x6e42ecd2  // bfmmla v18.4s, v6.8h, v2.8h\n"
+      ".inst 0x6e43ecb5  // bfmmla v21.4s, v5.8h, v3.8h\n"
+      ".inst 0x6e42ecb8  // bfmmla v24.4s, v5.8h, v2.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e42ec9e  // bfmmla v30.4s, v4.8h, v2.8h\n"
+      ".inst 0x6e41ecea  // bfmmla v10.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e40eced  // bfmmla v13.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e41ecd0  // bfmmla v16.4s, v6.8h, v1.8h\n"
+      ".inst 0x6e40ecd3  // bfmmla v19.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e41ecb6  // bfmmla v22.4s, v5.8h, v1.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      ".inst 0x6e41ec9c  // bfmmla v28.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
       "6:"  // multiply loop done
-      "subs x24, x24, #0xc\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "subs x25, x25, #0xc\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
index 1495306..b9b4ad5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
index 36bfccf..1e3f2f3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp
@@ -51,27 +51,27 @@
 
     __asm__ __volatile__(
       "1:"  // Height loop
-      "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x23, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x24, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
-      "add x22, x25, x20, LSL #1\n"
+      "add x22, x23, x20, LSL #1\n"
       "add x21, x22, x20, LSL #1\n"
       "add x20, x21, x20, LSL #1\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "cmp x24, #0x10\n"
-      "mov %x[Apanel], x23\n"
+      "cmp x25, #0x10\n"
+      "mov %x[Apanel], x24\n"
       "bgt 3f\n"
-      "cmp x24, #0x8\n"
-      "mov x21, x25\n"
+      "cmp x25, #0x8\n"
+      "mov x21, x23\n"
       "bgt 3f\n"
-      "mov x22, x25\n"
+      "mov x22, x23\n"
       "3:"  // B setup done
       "ldr q0, [%x[Apanel], #0x0]\n"
-      "ldr q2, [x25, #0x0]\n"
+      "ldr q2, [x23, #0x0]\n"
       "movi v8.16b, #0x0\n"
       "ldr q3, [x22, #0x0]\n"
       "ldr q4, [x21, #0x0]\n"
@@ -102,11 +102,11 @@
       "movi v31.16b, #0x0\n"
       "blt 5f\n"
       "4:"  // main loop head
-      "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q5, [x25, #0x10]\n"
+      "ldr q7, [%x[Apanel], #0x10]\n"
+      "ldr q6, [x23, #0x10]\n"
       "fmla v8.8h, v2.8h, v0.h[0]\n"
-      "ldr q6, [x22, #0x10]\n"
-      "ldr q7, [x21, #0x10]\n"
+      "ldr q5, [x22, #0x10]\n"
+      "ldr q1, [x21, #0x10]\n"
       "fmla v11.8h, v2.8h, v0.h[1]\n"
       "fmla v14.8h, v2.8h, v0.h[2]\n"
       "fmla v17.8h, v2.8h, v0.h[3]\n"
@@ -119,8 +119,8 @@
       "add %x[Apanel], %x[Apanel], #0x20\n"
       "fmla v9.8h, v3.8h, v0.h[0]\n"
       "fmla v12.8h, v3.8h, v0.h[1]\n"
-      "add x25, x25, #0x20\n"
-      "ldr q2, [x25, #0x0]\n"
+      "add x23, x23, #0x20\n"
+      "ldr q2, [x23, #0x0]\n"
       "fmla v15.8h, v3.8h, v0.h[2]\n"
       "fmla v18.8h, v3.8h, v0.h[3]\n"
       "fmla v21.8h, v3.8h, v0.h[4]\n"
@@ -140,30 +140,30 @@
       "fmla v31.8h, v4.8h, v0.h[7]\n"
       "ldr q0, [%x[Apanel], #0x0]\n"
       "ldr q4, [x21, #0x0]\n"
-      "fmla v8.8h, v5.8h, v1.h[0]\n"
-      "fmla v11.8h, v5.8h, v1.h[1]\n"
-      "fmla v14.8h, v5.8h, v1.h[2]\n"
-      "fmla v17.8h, v5.8h, v1.h[3]\n"
-      "fmla v20.8h, v5.8h, v1.h[4]\n"
-      "fmla v23.8h, v5.8h, v1.h[5]\n"
-      "fmla v26.8h, v5.8h, v1.h[6]\n"
-      "fmla v29.8h, v5.8h, v1.h[7]\n"
-      "fmla v9.8h, v6.8h, v1.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v15.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v1.h[3]\n"
-      "fmla v21.8h, v6.8h, v1.h[4]\n"
-      "fmla v24.8h, v6.8h, v1.h[5]\n"
-      "fmla v27.8h, v6.8h, v1.h[6]\n"
-      "fmla v30.8h, v6.8h, v1.h[7]\n"
-      "fmla v10.8h, v7.8h, v1.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v16.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v1.h[3]\n"
-      "fmla v22.8h, v7.8h, v1.h[4]\n"
-      "fmla v25.8h, v7.8h, v1.h[5]\n"
-      "fmla v28.8h, v7.8h, v1.h[6]\n"
-      "fmla v31.8h, v7.8h, v1.h[7]\n"
+      "fmla v8.8h, v6.8h, v7.h[0]\n"
+      "fmla v11.8h, v6.8h, v7.h[1]\n"
+      "fmla v14.8h, v6.8h, v7.h[2]\n"
+      "fmla v17.8h, v6.8h, v7.h[3]\n"
+      "fmla v20.8h, v6.8h, v7.h[4]\n"
+      "fmla v23.8h, v6.8h, v7.h[5]\n"
+      "fmla v26.8h, v6.8h, v7.h[6]\n"
+      "fmla v29.8h, v6.8h, v7.h[7]\n"
+      "fmla v9.8h, v5.8h, v7.h[0]\n"
+      "fmla v12.8h, v5.8h, v7.h[1]\n"
+      "fmla v15.8h, v5.8h, v7.h[2]\n"
+      "fmla v18.8h, v5.8h, v7.h[3]\n"
+      "fmla v21.8h, v5.8h, v7.h[4]\n"
+      "fmla v24.8h, v5.8h, v7.h[5]\n"
+      "fmla v27.8h, v5.8h, v7.h[6]\n"
+      "fmla v30.8h, v5.8h, v7.h[7]\n"
+      "fmla v10.8h, v1.8h, v7.h[0]\n"
+      "fmla v13.8h, v1.8h, v7.h[1]\n"
+      "fmla v16.8h, v1.8h, v7.h[2]\n"
+      "fmla v19.8h, v1.8h, v7.h[3]\n"
+      "fmla v22.8h, v1.8h, v7.h[4]\n"
+      "fmla v25.8h, v1.8h, v7.h[5]\n"
+      "fmla v28.8h, v1.8h, v7.h[6]\n"
+      "fmla v31.8h, v1.8h, v7.h[7]\n"
       "bge 4b\n"
       "5:"  // main loop skip
       "fmla v8.8h, v2.8h, v0.h[0]\n"
@@ -171,7 +171,7 @@
       "add %x[Apanel], %x[Apanel], #0x10\n"
       "fmla v14.8h, v2.8h, v0.h[2]\n"
       "fmla v17.8h, v2.8h, v0.h[3]\n"
-      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
       "fmla v20.8h, v2.8h, v0.h[4]\n"
       "fmla v23.8h, v2.8h, v0.h[5]\n"
       "add x22, x22, #0x10\n"
@@ -195,37 +195,37 @@
       "fmla v28.8h, v4.8h, v0.h[6]\n"
       "fmla v31.8h, v4.8h, v0.h[7]\n"
       "cbz x20, 6f\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      "ldr q5, [x25, #0x0]\n"
-      "fmla v8.8h, v5.8h, v0.h[0]\n"
-      "ldr q6, [x22, #0x0]\n"
-      "ldr q7, [x21, #0x0]\n"
-      "fmla v11.8h, v5.8h, v0.h[1]\n"
-      "fmla v14.8h, v5.8h, v0.h[2]\n"
-      "fmla v17.8h, v5.8h, v0.h[3]\n"
+      "ldr q3, [%x[Apanel], #0x0]\n"
+      "ldr q2, [x23, #0x0]\n"
+      "fmla v8.8h, v2.8h, v3.h[0]\n"
+      "ldr q1, [x22, #0x0]\n"
+      "ldr q0, [x21, #0x0]\n"
+      "fmla v11.8h, v2.8h, v3.h[1]\n"
+      "fmla v14.8h, v2.8h, v3.h[2]\n"
+      "fmla v17.8h, v2.8h, v3.h[3]\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
-      "fmla v20.8h, v5.8h, v0.h[4]\n"
-      "fmla v23.8h, v5.8h, v0.h[5]\n"
-      "fmla v26.8h, v5.8h, v0.h[6]\n"
-      "fmla v29.8h, v5.8h, v0.h[7]\n"
-      "fmla v9.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v0.h[1]\n"
-      "fmla v15.8h, v6.8h, v0.h[2]\n"
-      "fmla v18.8h, v6.8h, v0.h[3]\n"
-      "fmla v21.8h, v6.8h, v0.h[4]\n"
-      "fmla v24.8h, v6.8h, v0.h[5]\n"
-      "fmla v27.8h, v6.8h, v0.h[6]\n"
-      "fmla v30.8h, v6.8h, v0.h[7]\n"
-      "fmla v10.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v0.h[1]\n"
-      "fmla v16.8h, v7.8h, v0.h[2]\n"
-      "fmla v19.8h, v7.8h, v0.h[3]\n"
-      "fmla v22.8h, v7.8h, v0.h[4]\n"
-      "fmla v25.8h, v7.8h, v0.h[5]\n"
-      "fmla v28.8h, v7.8h, v0.h[6]\n"
-      "fmla v31.8h, v7.8h, v0.h[7]\n"
+      "fmla v20.8h, v2.8h, v3.h[4]\n"
+      "fmla v23.8h, v2.8h, v3.h[5]\n"
+      "fmla v26.8h, v2.8h, v3.h[6]\n"
+      "fmla v29.8h, v2.8h, v3.h[7]\n"
+      "fmla v9.8h, v1.8h, v3.h[0]\n"
+      "fmla v12.8h, v1.8h, v3.h[1]\n"
+      "fmla v15.8h, v1.8h, v3.h[2]\n"
+      "fmla v18.8h, v1.8h, v3.h[3]\n"
+      "fmla v21.8h, v1.8h, v3.h[4]\n"
+      "fmla v24.8h, v1.8h, v3.h[5]\n"
+      "fmla v27.8h, v1.8h, v3.h[6]\n"
+      "fmla v30.8h, v1.8h, v3.h[7]\n"
+      "fmla v10.8h, v0.8h, v3.h[0]\n"
+      "fmla v13.8h, v0.8h, v3.h[1]\n"
+      "fmla v16.8h, v0.8h, v3.h[2]\n"
+      "fmla v19.8h, v0.8h, v3.h[3]\n"
+      "fmla v22.8h, v0.8h, v3.h[4]\n"
+      "fmla v25.8h, v0.8h, v3.h[5]\n"
+      "fmla v28.8h, v0.8h, v3.h[6]\n"
+      "fmla v31.8h, v0.8h, v3.h[7]\n"
       "6:"  // multiply loop done
-      "subs x24, x24, #0x18\n"
+      "subs x25, x25, #0x18\n"
       "str q8, [%x[Cpanel], #0x0]\n"
       "str q9, [%x[Cpanel], #0x10]\n"
       "str q10, [%x[Cpanel], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
index f2a836c..c4445ba 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
index ec99d64..6de0a38 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp
@@ -51,29 +51,29 @@
 
     __asm__ __volatile__(
       "1:"  // Height loop
-      "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x24, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x23, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x24, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
-      "add x22, x25, x20, LSL #2\n"
+      "add x22, x23, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
       "add x20, x21, x20, LSL #2\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "cmp x24, #0x8\n"
-      "mov %x[Apanel], x23\n"
+      "cmp x25, #0x8\n"
+      "mov %x[Apanel], x24\n"
       "bgt 3f\n"
-      "cmp x24, #0x4\n"
-      "mov x21, x25\n"
+      "cmp x25, #0x4\n"
+      "mov x21, x23\n"
       "bgt 3f\n"
-      "mov x22, x25\n"
+      "mov x22, x23\n"
       "3:"  // B setup done
       "ldr q0, [%x[Apanel], #0x0]\n"
       "ldr q1, [%x[Apanel], #0x10]\n"
       "movi v8.16b, #0x0\n"
-      "ldr q4, [x25, #0x0]\n"
+      "ldr q4, [x23, #0x0]\n"
       "ldr q5, [x22, #0x0]\n"
       "movi v9.16b, #0x0\n"
       "ldr q6, [x21, #0x0]\n"
@@ -103,10 +103,10 @@
       "movi v31.16b, #0x0\n"
       "blt 5f\n"
       "4:"  // main loop head
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
+      "ldr q3, [%x[Apanel], #0x20]\n"
+      "ldr q7, [%x[Apanel], #0x30]\n"
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q7, [x25, #0x10]\n"
+      "ldr q2, [x23, #0x10]\n"
       "fmla v11.4s, v4.4s, v0.s[1]\n"
       "fmla v14.4s, v4.4s, v0.s[2]\n"
       "fmla v17.4s, v4.4s, v0.s[3]\n"
@@ -136,36 +136,36 @@
       "fmla v28.4s, v6.4s, v1.s[2]\n"
       "fmla v31.4s, v6.4s, v1.s[3]\n"
       "ldr q1, [%x[Apanel], #0x50]\n"
-      "ldr q6, [x25, #0x20]\n"
-      "fmla v8.4s, v7.4s, v2.s[0]\n"
-      "fmla v11.4s, v7.4s, v2.s[1]\n"
-      "fmla v14.4s, v7.4s, v2.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v20.4s, v7.4s, v3.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "fmla v26.4s, v7.4s, v3.s[2]\n"
-      "fmla v29.4s, v7.4s, v3.s[3]\n"
-      "ldr q7, [x22, #0x20]\n"
-      "fmla v9.4s, v4.4s, v2.s[0]\n"
-      "fmla v12.4s, v4.4s, v2.s[1]\n"
-      "fmla v15.4s, v4.4s, v2.s[2]\n"
-      "fmla v18.4s, v4.4s, v2.s[3]\n"
-      "fmla v21.4s, v4.4s, v3.s[0]\n"
-      "fmla v24.4s, v4.4s, v3.s[1]\n"
-      "fmla v27.4s, v4.4s, v3.s[2]\n"
-      "fmla v30.4s, v4.4s, v3.s[3]\n"
+      "ldr q6, [x23, #0x20]\n"
+      "fmla v8.4s, v2.4s, v3.s[0]\n"
+      "fmla v11.4s, v2.4s, v3.s[1]\n"
+      "fmla v14.4s, v2.4s, v3.s[2]\n"
+      "fmla v17.4s, v2.4s, v3.s[3]\n"
+      "fmla v20.4s, v2.4s, v7.s[0]\n"
+      "fmla v23.4s, v2.4s, v7.s[1]\n"
+      "fmla v26.4s, v2.4s, v7.s[2]\n"
+      "fmla v29.4s, v2.4s, v7.s[3]\n"
+      "ldr q2, [x22, #0x20]\n"
+      "fmla v9.4s, v4.4s, v3.s[0]\n"
+      "fmla v12.4s, v4.4s, v3.s[1]\n"
+      "fmla v15.4s, v4.4s, v3.s[2]\n"
+      "fmla v18.4s, v4.4s, v3.s[3]\n"
+      "fmla v21.4s, v4.4s, v7.s[0]\n"
+      "fmla v24.4s, v4.4s, v7.s[1]\n"
+      "fmla v27.4s, v4.4s, v7.s[2]\n"
+      "fmla v30.4s, v4.4s, v7.s[3]\n"
       "ldr q4, [x21, #0x20]\n"
-      "fmla v10.4s, v5.4s, v2.s[0]\n"
-      "fmla v13.4s, v5.4s, v2.s[1]\n"
-      "fmla v16.4s, v5.4s, v2.s[2]\n"
-      "fmla v19.4s, v5.4s, v2.s[3]\n"
-      "ldr q2, [%x[Apanel], #0x60]\n"
-      "fmla v22.4s, v5.4s, v3.s[0]\n"
-      "fmla v25.4s, v5.4s, v3.s[1]\n"
-      "fmla v28.4s, v5.4s, v3.s[2]\n"
-      "fmla v31.4s, v5.4s, v3.s[3]\n"
-      "ldr q3, [%x[Apanel], #0x70]\n"
-      "ldr q5, [x25, #0x30]\n"
+      "fmla v10.4s, v5.4s, v3.s[0]\n"
+      "fmla v13.4s, v5.4s, v3.s[1]\n"
+      "fmla v16.4s, v5.4s, v3.s[2]\n"
+      "fmla v19.4s, v5.4s, v3.s[3]\n"
+      "ldr q3, [%x[Apanel], #0x60]\n"
+      "fmla v22.4s, v5.4s, v7.s[0]\n"
+      "fmla v25.4s, v5.4s, v7.s[1]\n"
+      "fmla v28.4s, v5.4s, v7.s[2]\n"
+      "fmla v31.4s, v5.4s, v7.s[3]\n"
+      "ldr q7, [%x[Apanel], #0x70]\n"
+      "ldr q5, [x23, #0x30]\n"
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "fmla v11.4s, v6.4s, v0.s[1]\n"
       "fmla v14.4s, v6.4s, v0.s[2]\n"
@@ -173,20 +173,20 @@
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "fmla v20.4s, v6.4s, v1.s[0]\n"
       "fmla v23.4s, v6.4s, v1.s[1]\n"
-      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
       "fmla v26.4s, v6.4s, v1.s[2]\n"
       "fmla v29.4s, v6.4s, v1.s[3]\n"
       "ldr q6, [x22, #0x30]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v12.4s, v7.4s, v0.s[1]\n"
+      "fmla v9.4s, v2.4s, v0.s[0]\n"
+      "fmla v12.4s, v2.4s, v0.s[1]\n"
       "add x22, x22, #0x40\n"
-      "fmla v15.4s, v7.4s, v0.s[2]\n"
-      "fmla v18.4s, v7.4s, v0.s[3]\n"
-      "fmla v21.4s, v7.4s, v1.s[0]\n"
-      "fmla v24.4s, v7.4s, v1.s[1]\n"
-      "fmla v27.4s, v7.4s, v1.s[2]\n"
-      "fmla v30.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x21, #0x30]\n"
+      "fmla v15.4s, v2.4s, v0.s[2]\n"
+      "fmla v18.4s, v2.4s, v0.s[3]\n"
+      "fmla v21.4s, v2.4s, v1.s[0]\n"
+      "fmla v24.4s, v2.4s, v1.s[1]\n"
+      "fmla v27.4s, v2.4s, v1.s[2]\n"
+      "fmla v30.4s, v2.4s, v1.s[3]\n"
+      "ldr q2, [x21, #0x30]\n"
       "fmla v10.4s, v4.4s, v0.s[0]\n"
       "fmla v13.4s, v4.4s, v0.s[1]\n"
       "add x21, x21, #0x40\n"
@@ -198,33 +198,33 @@
       "fmla v28.4s, v4.4s, v1.s[2]\n"
       "fmla v31.4s, v4.4s, v1.s[3]\n"
       "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q4, [x25, #0x0]\n"
-      "fmla v8.4s, v5.4s, v2.s[0]\n"
-      "fmla v11.4s, v5.4s, v2.s[1]\n"
-      "fmla v14.4s, v5.4s, v2.s[2]\n"
-      "fmla v17.4s, v5.4s, v2.s[3]\n"
-      "fmla v20.4s, v5.4s, v3.s[0]\n"
-      "fmla v23.4s, v5.4s, v3.s[1]\n"
-      "fmla v26.4s, v5.4s, v3.s[2]\n"
-      "fmla v29.4s, v5.4s, v3.s[3]\n"
+      "ldr q4, [x23, #0x0]\n"
+      "fmla v8.4s, v5.4s, v3.s[0]\n"
+      "fmla v11.4s, v5.4s, v3.s[1]\n"
+      "fmla v14.4s, v5.4s, v3.s[2]\n"
+      "fmla v17.4s, v5.4s, v3.s[3]\n"
+      "fmla v20.4s, v5.4s, v7.s[0]\n"
+      "fmla v23.4s, v5.4s, v7.s[1]\n"
+      "fmla v26.4s, v5.4s, v7.s[2]\n"
+      "fmla v29.4s, v5.4s, v7.s[3]\n"
       "ldr q5, [x22, #0x0]\n"
-      "fmla v9.4s, v6.4s, v2.s[0]\n"
-      "fmla v12.4s, v6.4s, v2.s[1]\n"
-      "fmla v15.4s, v6.4s, v2.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v21.4s, v6.4s, v3.s[0]\n"
-      "fmla v24.4s, v6.4s, v3.s[1]\n"
-      "fmla v27.4s, v6.4s, v3.s[2]\n"
-      "fmla v30.4s, v6.4s, v3.s[3]\n"
+      "fmla v9.4s, v6.4s, v3.s[0]\n"
+      "fmla v12.4s, v6.4s, v3.s[1]\n"
+      "fmla v15.4s, v6.4s, v3.s[2]\n"
+      "fmla v18.4s, v6.4s, v3.s[3]\n"
+      "fmla v21.4s, v6.4s, v7.s[0]\n"
+      "fmla v24.4s, v6.4s, v7.s[1]\n"
+      "fmla v27.4s, v6.4s, v7.s[2]\n"
+      "fmla v30.4s, v6.4s, v7.s[3]\n"
       "ldr q6, [x21, #0x0]\n"
-      "fmla v10.4s, v7.4s, v2.s[0]\n"
-      "fmla v13.4s, v7.4s, v2.s[1]\n"
-      "fmla v16.4s, v7.4s, v2.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v22.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v3.s[1]\n"
-      "fmla v28.4s, v7.4s, v3.s[2]\n"
-      "fmla v31.4s, v7.4s, v3.s[3]\n"
+      "fmla v10.4s, v2.4s, v3.s[0]\n"
+      "fmla v13.4s, v2.4s, v3.s[1]\n"
+      "fmla v16.4s, v2.4s, v3.s[2]\n"
+      "fmla v19.4s, v2.4s, v3.s[3]\n"
+      "fmla v22.4s, v2.4s, v7.s[0]\n"
+      "fmla v25.4s, v2.4s, v7.s[1]\n"
+      "fmla v28.4s, v2.4s, v7.s[2]\n"
+      "fmla v31.4s, v2.4s, v7.s[3]\n"
       "bge 4b\n"
       "5:"  // main loop skip
       "fmla v8.4s, v4.4s, v0.s[0]\n"
@@ -232,7 +232,7 @@
       "add %x[Apanel], %x[Apanel], #0x20\n"
       "fmla v14.4s, v4.4s, v0.s[2]\n"
       "fmla v17.4s, v4.4s, v0.s[3]\n"
-      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
       "fmla v20.4s, v4.4s, v1.s[0]\n"
       "fmla v23.4s, v4.4s, v1.s[1]\n"
       "add x22, x22, #0x10\n"
@@ -257,43 +257,43 @@
       "fmla v31.4s, v6.4s, v1.s[3]\n"
       "cbz x20, 7f\n"
       "6:"  // odd loop
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
+      "ldr q4, [%x[Apanel], #0x0]\n"
+      "ldr q3, [%x[Apanel], #0x10]\n"
       "subs x20, x20, #0x1\n"
-      "ldr q7, [x25, #0x0]\n"
-      "ldr q4, [x22, #0x0]\n"
-      "fmla v8.4s, v7.4s, v0.s[0]\n"
-      "ldr q5, [x21, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v14.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v0.s[3]\n"
-      "fmla v20.4s, v7.4s, v1.s[0]\n"
+      "ldr q2, [x23, #0x0]\n"
+      "ldr q1, [x22, #0x0]\n"
+      "fmla v8.4s, v2.4s, v4.s[0]\n"
+      "ldr q0, [x21, #0x0]\n"
+      "fmla v11.4s, v2.4s, v4.s[1]\n"
+      "fmla v14.4s, v2.4s, v4.s[2]\n"
+      "fmla v17.4s, v2.4s, v4.s[3]\n"
+      "fmla v20.4s, v2.4s, v3.s[0]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "fmla v23.4s, v7.4s, v1.s[1]\n"
-      "fmla v26.4s, v7.4s, v1.s[2]\n"
-      "add x25, x25, #0x10\n"
-      "fmla v29.4s, v7.4s, v1.s[3]\n"
-      "fmla v9.4s, v4.4s, v0.s[0]\n"
+      "fmla v23.4s, v2.4s, v3.s[1]\n"
+      "fmla v26.4s, v2.4s, v3.s[2]\n"
+      "add x23, x23, #0x10\n"
+      "fmla v29.4s, v2.4s, v3.s[3]\n"
+      "fmla v9.4s, v1.4s, v4.s[0]\n"
       "add x22, x22, #0x10\n"
-      "fmla v12.4s, v4.4s, v0.s[1]\n"
-      "fmla v15.4s, v4.4s, v0.s[2]\n"
+      "fmla v12.4s, v1.4s, v4.s[1]\n"
+      "fmla v15.4s, v1.4s, v4.s[2]\n"
       "add x21, x21, #0x10\n"
-      "fmla v18.4s, v4.4s, v0.s[3]\n"
-      "fmla v21.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v1.s[1]\n"
-      "fmla v27.4s, v4.4s, v1.s[2]\n"
-      "fmla v30.4s, v4.4s, v1.s[3]\n"
-      "fmla v10.4s, v5.4s, v0.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[1]\n"
-      "fmla v16.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v0.s[3]\n"
-      "fmla v22.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v1.s[1]\n"
-      "fmla v28.4s, v5.4s, v1.s[2]\n"
-      "fmla v31.4s, v5.4s, v1.s[3]\n"
+      "fmla v18.4s, v1.4s, v4.s[3]\n"
+      "fmla v21.4s, v1.4s, v3.s[0]\n"
+      "fmla v24.4s, v1.4s, v3.s[1]\n"
+      "fmla v27.4s, v1.4s, v3.s[2]\n"
+      "fmla v30.4s, v1.4s, v3.s[3]\n"
+      "fmla v10.4s, v0.4s, v4.s[0]\n"
+      "fmla v13.4s, v0.4s, v4.s[1]\n"
+      "fmla v16.4s, v0.4s, v4.s[2]\n"
+      "fmla v19.4s, v0.4s, v4.s[3]\n"
+      "fmla v22.4s, v0.4s, v3.s[0]\n"
+      "fmla v25.4s, v0.4s, v3.s[1]\n"
+      "fmla v28.4s, v0.4s, v3.s[2]\n"
+      "fmla v31.4s, v0.4s, v3.s[3]\n"
       "bne 6b\n"
       "7:"  // multiply loop done
-      "subs x24, x24, #0xc\n"
+      "subs x25, x25, #0xc\n"
       "str q8, [%x[Cpanel], #0x0]\n"
       "str q9, [%x[Cpanel], #0x10]\n"
       "str q10, [%x[Cpanel], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
index 3b8770e..f142766 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
index 02d2434..fc323ea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -93,7 +93,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 176f\n"
@@ -190,11 +189,11 @@
       "15:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -211,37 +210,37 @@
       "blt 19f\n"
       "18:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f40f22a  // bfdot v10.4s, v17.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x4f40f20b  // bfdot v11.4s, v16.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      ".inst 0x4f60f228  // bfdot v8.4s, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x4f60f209  // bfdot v9.4s, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x4f60f22a  // bfdot v10.4s, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4f60f20b  // bfdot v11.4s, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f40fa28  // bfdot v8.4s, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f40fa09  // bfdot v9.4s, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f40fa2a  // bfdot v10.4s, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f40fa0b  // bfdot v11.4s, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4f60fa28  // bfdot v8.4s, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4f60fa09  // bfdot v9.4s, v16.8h, v0.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f60fa2a  // bfdot v10.4s, v17.8h, v0.h[3]\n"
+      ".inst 0x4f60fa0b  // bfdot v11.4s, v16.8h, v0.h[3]\n"
       "ldr q0, [x26, #0x0]\n"
       "cmp x27, #0x10\n"
       "add x10, x10, #0x100\n"
@@ -251,37 +250,37 @@
       "bge 18b\n"
       "19:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f40f22a  // bfdot v10.4s, v17.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x4f40f20b  // bfdot v11.4s, v16.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      ".inst 0x4f60f228  // bfdot v8.4s, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x4f60f209  // bfdot v9.4s, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x4f60f22a  // bfdot v10.4s, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4f60f20b  // bfdot v11.4s, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f40fa28  // bfdot v8.4s, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f40fa09  // bfdot v9.4s, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f40fa2a  // bfdot v10.4s, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f40fa0b  // bfdot v11.4s, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4f60fa28  // bfdot v8.4s, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4f60fa09  // bfdot v9.4s, v16.8h, v0.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f60fa2a  // bfdot v10.4s, v17.8h, v0.h[3]\n"
+      ".inst 0x4f60fa0b  // bfdot v11.4s, v16.8h, v0.h[3]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "20:"  // Height 1: Multiply loop: Main loop skip
@@ -289,31 +288,31 @@
       "cmp x27, #0x2\n"
       "blt 22f\n"
       "21:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr q16, [x10, #0x0]\n"
+      ".inst 0x4f52f208  // bfdot v8.4s, v16.8h, v18.h[0]\n"
       "sub x27, x27, #0x2\n"
-      "ldr q7, [x10, #0x10]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4f52f209  // bfdot v9.4s, v16.8h, v18.h[0]\n"
       "cmp x27, #0x2\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f52f22a  // bfdot v10.4s, v17.8h, v18.h[0]\n"
+      ".inst 0x4f52f20b  // bfdot v11.4s, v16.8h, v18.h[0]\n"
       "add x10, x10, #0x40\n"
       "bge 21b\n"
       "22:"  // Height 1: Multiply loop: Skip odd blocks
       "cbz x27, 24f\n"
       "ldr h0, [x26, #0x0]\n"
       "23:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x4f40f228  // bfdot v8.4s, v17.8h, v0.h[0]\n"
+      ".inst 0x4f40f209  // bfdot v9.4s, v16.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f40f22a  // bfdot v10.4s, v17.8h, v0.h[0]\n"
+      ".inst 0x4f40f20b  // bfdot v11.4s, v16.8h, v0.h[0]\n"
       "add x10, x10, #0x40\n"
       "24:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -323,17 +322,17 @@
       "prfm pstl1keep, [x9, #0x0]\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
       "25:"  // Height 1: No activation
       "cmp x11, #0x10\n"
       "bge 34f\n"
@@ -511,12 +510,12 @@
       "50:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 51f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 52f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -524,7 +523,7 @@
       "b 52f\n"
       "51:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "52:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "blt 55f\n"
@@ -537,156 +536,156 @@
       "53:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
       ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "sub x27, x27, #0x8\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f40f22a  // bfdot v10.4s, v17.8h, v0.h[0]\n"
+      ".inst 0x4f41f22e  // bfdot v14.4s, v17.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x4f40f20b  // bfdot v11.4s, v16.8h, v0.h[0]\n"
+      ".inst 0x4f41f20f  // bfdot v15.4s, v16.8h, v1.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      ".inst 0x4f60f228  // bfdot v8.4s, v17.8h, v0.h[1]\n"
+      ".inst 0x4f61f22c  // bfdot v12.4s, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x4f60f209  // bfdot v9.4s, v16.8h, v0.h[1]\n"
+      ".inst 0x4f61f20d  // bfdot v13.4s, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f60f22a  // bfdot v10.4s, v17.8h, v0.h[1]\n"
+      ".inst 0x4f61f22e  // bfdot v14.4s, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4f60f20b  // bfdot v11.4s, v16.8h, v0.h[1]\n"
+      ".inst 0x4f61f20f  // bfdot v15.4s, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f40fa28  // bfdot v8.4s, v17.8h, v0.h[2]\n"
+      ".inst 0x4f41fa2c  // bfdot v12.4s, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f40fa09  // bfdot v9.4s, v16.8h, v0.h[2]\n"
+      ".inst 0x4f41fa0d  // bfdot v13.4s, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f40fa2a  // bfdot v10.4s, v17.8h, v0.h[2]\n"
+      ".inst 0x4f41fa2e  // bfdot v14.4s, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f40fa0b  // bfdot v11.4s, v16.8h, v0.h[2]\n"
+      ".inst 0x4f41fa0f  // bfdot v15.4s, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4f60fa28  // bfdot v8.4s, v17.8h, v0.h[3]\n"
+      ".inst 0x4f61fa2c  // bfdot v12.4s, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4f60fa09  // bfdot v9.4s, v16.8h, v0.h[3]\n"
+      ".inst 0x4f61fa0d  // bfdot v13.4s, v16.8h, v1.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f60fa2a  // bfdot v10.4s, v17.8h, v0.h[3]\n"
+      ".inst 0x4f61fa2e  // bfdot v14.4s, v17.8h, v1.h[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f60fa0b  // bfdot v11.4s, v16.8h, v0.h[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f61fa0f  // bfdot v15.4s, v16.8h, v1.h[3]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 53b\n"
       "54:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
       ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "add x26, x26, #0x10\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f40f22a  // bfdot v10.4s, v17.8h, v0.h[0]\n"
+      ".inst 0x4f41f22e  // bfdot v14.4s, v17.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x4f40f20b  // bfdot v11.4s, v16.8h, v0.h[0]\n"
+      ".inst 0x4f41f20f  // bfdot v15.4s, v16.8h, v1.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      ".inst 0x4f60f228  // bfdot v8.4s, v17.8h, v0.h[1]\n"
+      ".inst 0x4f61f22c  // bfdot v12.4s, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f60f209  // bfdot v9.4s, v16.8h, v0.h[1]\n"
+      ".inst 0x4f61f20d  // bfdot v13.4s, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x4f60f22a  // bfdot v10.4s, v17.8h, v0.h[1]\n"
+      ".inst 0x4f61f22e  // bfdot v14.4s, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4f60f20b  // bfdot v11.4s, v16.8h, v0.h[1]\n"
+      ".inst 0x4f61f20f  // bfdot v15.4s, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f40fa28  // bfdot v8.4s, v17.8h, v0.h[2]\n"
+      ".inst 0x4f41fa2c  // bfdot v12.4s, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f40fa09  // bfdot v9.4s, v16.8h, v0.h[2]\n"
+      ".inst 0x4f41fa0d  // bfdot v13.4s, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f40fa2a  // bfdot v10.4s, v17.8h, v0.h[2]\n"
+      ".inst 0x4f41fa2e  // bfdot v14.4s, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f40fa0b  // bfdot v11.4s, v16.8h, v0.h[2]\n"
+      ".inst 0x4f41fa0f  // bfdot v15.4s, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4f60fa28  // bfdot v8.4s, v17.8h, v0.h[3]\n"
+      ".inst 0x4f61fa2c  // bfdot v12.4s, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4f60fa09  // bfdot v9.4s, v16.8h, v0.h[3]\n"
+      ".inst 0x4f61fa0d  // bfdot v13.4s, v16.8h, v1.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f60fa2a  // bfdot v10.4s, v17.8h, v0.h[3]\n"
+      ".inst 0x4f61fa2e  // bfdot v14.4s, v17.8h, v1.h[3]\n"
+      ".inst 0x4f60fa0b  // bfdot v11.4s, v16.8h, v0.h[3]\n"
+      ".inst 0x4f61fa0f  // bfdot v15.4s, v16.8h, v1.h[3]\n"
       "55:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 59f\n"
       "cmp x27, #0x2\n"
       "blt 57f\n"
       "56:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
       "sub x27, x27, #0x2\n"
       "cmp x27, #0x2\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x4f53f228  // bfdot v8.4s, v17.8h, v19.h[0]\n"
+      ".inst 0x4f52f22c  // bfdot v12.4s, v17.8h, v18.h[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4f53f209  // bfdot v9.4s, v16.8h, v19.h[0]\n"
+      ".inst 0x4f52f20d  // bfdot v13.4s, v16.8h, v18.h[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f53f22a  // bfdot v10.4s, v17.8h, v19.h[0]\n"
+      ".inst 0x4f52f22e  // bfdot v14.4s, v17.8h, v18.h[0]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f53f20b  // bfdot v11.4s, v16.8h, v19.h[0]\n"
+      ".inst 0x4f52f20f  // bfdot v15.4s, v16.8h, v18.h[0]\n"
       "bge 56b\n"
       "57:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x27, 59f\n"
       "ldr h0, [x26, #0x0]\n"
       "ldr h1, [x25, #0x0]\n"
       "58:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x4f40f228  // bfdot v8.4s, v17.8h, v0.h[0]\n"
+      ".inst 0x4f41f22c  // bfdot v12.4s, v17.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4f40f209  // bfdot v9.4s, v16.8h, v0.h[0]\n"
+      ".inst 0x4f41f20d  // bfdot v13.4s, v16.8h, v1.h[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f40f22a  // bfdot v10.4s, v17.8h, v0.h[0]\n"
+      ".inst 0x4f41f22e  // bfdot v14.4s, v17.8h, v1.h[0]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f40f20b  // bfdot v11.4s, v16.8h, v0.h[0]\n"
+      ".inst 0x4f41f20f  // bfdot v15.4s, v16.8h, v1.h[0]\n"
       "59:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -698,25 +697,25 @@
       "prfm pstl1keep, [x25, #0x0]\n"
       "tbz %x[flags], #1, 60f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmin v12.4s, v12.4s, v17.4s\n"
+      "fmin v13.4s, v13.4s, v17.4s\n"
+      "fmin v14.4s, v14.4s, v17.4s\n"
+      "fmin v15.4s, v15.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
+      "fmax v12.4s, v12.4s, v16.4s\n"
+      "fmax v13.4s, v13.4s, v16.4s\n"
+      "fmax v14.4s, v14.4s, v16.4s\n"
+      "fmax v15.4s, v15.4s, v16.4s\n"
       "60:"  // Height 2: No activation
       "cmp x11, #0x10\n"
       "bge 69f\n"
@@ -943,13 +942,13 @@
       "85:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 86f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 87f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -958,8 +957,8 @@
       "b 87f\n"
       "86:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "87:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "blt 90f\n"
@@ -976,75 +975,75 @@
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
       ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
       ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f2aa  // bfdot v10.4s, v21.8h, v0.h[0]\n"
+      ".inst 0x4f41f2ae  // bfdot v14.4s, v21.8h, v1.h[0]\n"
       "cmp x27, #0x10\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f42f2b2  // bfdot v18.4s, v21.8h, v2.h[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      ".inst 0x4f40f28b  // bfdot v11.4s, v20.8h, v0.h[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x4f41f28f  // bfdot v15.4s, v20.8h, v1.h[0]\n"
+      ".inst 0x4f42f293  // bfdot v19.4s, v20.8h, v2.h[0]\n"
+      "ldr q20, [x10, #0x50]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f60f2a8  // bfdot v8.4s, v21.8h, v0.h[1]\n"
+      ".inst 0x4f61f2ac  // bfdot v12.4s, v21.8h, v1.h[1]\n"
+      ".inst 0x4f62f2b0  // bfdot v16.4s, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      ".inst 0x4f60f289  // bfdot v9.4s, v20.8h, v0.h[1]\n"
+      ".inst 0x4f61f28d  // bfdot v13.4s, v20.8h, v1.h[1]\n"
+      ".inst 0x4f62f291  // bfdot v17.4s, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      ".inst 0x4f60f2aa  // bfdot v10.4s, v21.8h, v0.h[1]\n"
+      ".inst 0x4f61f2ae  // bfdot v14.4s, v21.8h, v1.h[1]\n"
+      ".inst 0x4f62f2b2  // bfdot v18.4s, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      ".inst 0x4f60f28b  // bfdot v11.4s, v20.8h, v0.h[1]\n"
+      ".inst 0x4f61f28f  // bfdot v15.4s, v20.8h, v1.h[1]\n"
+      ".inst 0x4f62f293  // bfdot v19.4s, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      ".inst 0x4f40faa8  // bfdot v8.4s, v21.8h, v0.h[2]\n"
+      ".inst 0x4f41faac  // bfdot v12.4s, v21.8h, v1.h[2]\n"
+      ".inst 0x4f42fab0  // bfdot v16.4s, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      ".inst 0x4f40fa89  // bfdot v9.4s, v20.8h, v0.h[2]\n"
+      ".inst 0x4f41fa8d  // bfdot v13.4s, v20.8h, v1.h[2]\n"
+      ".inst 0x4f42fa91  // bfdot v17.4s, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      ".inst 0x4f40faaa  // bfdot v10.4s, v21.8h, v0.h[2]\n"
+      ".inst 0x4f41faae  // bfdot v14.4s, v21.8h, v1.h[2]\n"
+      ".inst 0x4f42fab2  // bfdot v18.4s, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      ".inst 0x4f40fa8b  // bfdot v11.4s, v20.8h, v0.h[2]\n"
+      ".inst 0x4f41fa8f  // bfdot v15.4s, v20.8h, v1.h[2]\n"
+      ".inst 0x4f42fa93  // bfdot v19.4s, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      ".inst 0x4f60faa8  // bfdot v8.4s, v21.8h, v0.h[3]\n"
+      ".inst 0x4f61faac  // bfdot v12.4s, v21.8h, v1.h[3]\n"
+      ".inst 0x4f62fab0  // bfdot v16.4s, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      ".inst 0x4f60fa89  // bfdot v9.4s, v20.8h, v0.h[3]\n"
+      ".inst 0x4f61fa8d  // bfdot v13.4s, v20.8h, v1.h[3]\n"
+      ".inst 0x4f62fa91  // bfdot v17.4s, v20.8h, v2.h[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f60faaa  // bfdot v10.4s, v21.8h, v0.h[3]\n"
+      ".inst 0x4f61faae  // bfdot v14.4s, v21.8h, v1.h[3]\n"
+      ".inst 0x4f62fab2  // bfdot v18.4s, v21.8h, v2.h[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f60fa8b  // bfdot v11.4s, v20.8h, v0.h[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f61fa8f  // bfdot v15.4s, v20.8h, v1.h[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f62fa93  // bfdot v19.4s, v20.8h, v2.h[3]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 88b\n"
@@ -1054,98 +1053,98 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
       ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f2aa  // bfdot v10.4s, v21.8h, v0.h[0]\n"
+      ".inst 0x4f41f2ae  // bfdot v14.4s, v21.8h, v1.h[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f42f2b2  // bfdot v18.4s, v21.8h, v2.h[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      ".inst 0x4f40f28b  // bfdot v11.4s, v20.8h, v0.h[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f41f28f  // bfdot v15.4s, v20.8h, v1.h[0]\n"
+      ".inst 0x4f42f293  // bfdot v19.4s, v20.8h, v2.h[0]\n"
+      "ldr q20, [x10, #0x50]\n"
+      ".inst 0x4f60f2a8  // bfdot v8.4s, v21.8h, v0.h[1]\n"
+      ".inst 0x4f61f2ac  // bfdot v12.4s, v21.8h, v1.h[1]\n"
+      ".inst 0x4f62f2b0  // bfdot v16.4s, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      ".inst 0x4f60f289  // bfdot v9.4s, v20.8h, v0.h[1]\n"
+      ".inst 0x4f61f28d  // bfdot v13.4s, v20.8h, v1.h[1]\n"
+      ".inst 0x4f62f291  // bfdot v17.4s, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      ".inst 0x4f60f2aa  // bfdot v10.4s, v21.8h, v0.h[1]\n"
+      ".inst 0x4f61f2ae  // bfdot v14.4s, v21.8h, v1.h[1]\n"
+      ".inst 0x4f62f2b2  // bfdot v18.4s, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      ".inst 0x4f60f28b  // bfdot v11.4s, v20.8h, v0.h[1]\n"
+      ".inst 0x4f61f28f  // bfdot v15.4s, v20.8h, v1.h[1]\n"
+      ".inst 0x4f62f293  // bfdot v19.4s, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      ".inst 0x4f40faa8  // bfdot v8.4s, v21.8h, v0.h[2]\n"
+      ".inst 0x4f41faac  // bfdot v12.4s, v21.8h, v1.h[2]\n"
+      ".inst 0x4f42fab0  // bfdot v16.4s, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      ".inst 0x4f40fa89  // bfdot v9.4s, v20.8h, v0.h[2]\n"
+      ".inst 0x4f41fa8d  // bfdot v13.4s, v20.8h, v1.h[2]\n"
+      ".inst 0x4f42fa91  // bfdot v17.4s, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      ".inst 0x4f40faaa  // bfdot v10.4s, v21.8h, v0.h[2]\n"
+      ".inst 0x4f41faae  // bfdot v14.4s, v21.8h, v1.h[2]\n"
+      ".inst 0x4f42fab2  // bfdot v18.4s, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      ".inst 0x4f40fa8b  // bfdot v11.4s, v20.8h, v0.h[2]\n"
+      ".inst 0x4f41fa8f  // bfdot v15.4s, v20.8h, v1.h[2]\n"
+      ".inst 0x4f42fa93  // bfdot v19.4s, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      ".inst 0x4f60faa8  // bfdot v8.4s, v21.8h, v0.h[3]\n"
+      ".inst 0x4f61faac  // bfdot v12.4s, v21.8h, v1.h[3]\n"
+      ".inst 0x4f62fab0  // bfdot v16.4s, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      ".inst 0x4f60fa89  // bfdot v9.4s, v20.8h, v0.h[3]\n"
+      ".inst 0x4f61fa8d  // bfdot v13.4s, v20.8h, v1.h[3]\n"
+      ".inst 0x4f62fa91  // bfdot v17.4s, v20.8h, v2.h[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f60faaa  // bfdot v10.4s, v21.8h, v0.h[3]\n"
+      ".inst 0x4f61faae  // bfdot v14.4s, v21.8h, v1.h[3]\n"
+      ".inst 0x4f62fab2  // bfdot v18.4s, v21.8h, v2.h[3]\n"
+      ".inst 0x4f60fa8b  // bfdot v11.4s, v20.8h, v0.h[3]\n"
+      ".inst 0x4f61fa8f  // bfdot v15.4s, v20.8h, v1.h[3]\n"
+      ".inst 0x4f62fa93  // bfdot v19.4s, v20.8h, v2.h[3]\n"
       "90:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 94f\n"
       "cmp x27, #0x2\n"
       "blt 92f\n"
       "91:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
       "sub x27, x27, #0x2\n"
       "cmp x27, #0x2\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr q21, [x10, #0x0]\n"
+      ".inst 0x4f58f2a8  // bfdot v8.4s, v21.8h, v24.h[0]\n"
+      ".inst 0x4f57f2ac  // bfdot v12.4s, v21.8h, v23.h[0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      ".inst 0x4f56f2b0  // bfdot v16.4s, v21.8h, v22.h[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      ".inst 0x4f58f289  // bfdot v9.4s, v20.8h, v24.h[0]\n"
+      ".inst 0x4f57f28d  // bfdot v13.4s, v20.8h, v23.h[0]\n"
+      ".inst 0x4f56f291  // bfdot v17.4s, v20.8h, v22.h[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f58f2aa  // bfdot v10.4s, v21.8h, v24.h[0]\n"
+      ".inst 0x4f57f2ae  // bfdot v14.4s, v21.8h, v23.h[0]\n"
+      ".inst 0x4f56f2b2  // bfdot v18.4s, v21.8h, v22.h[0]\n"
+      ".inst 0x4f58f28b  // bfdot v11.4s, v20.8h, v24.h[0]\n"
+      ".inst 0x4f57f28f  // bfdot v15.4s, v20.8h, v23.h[0]\n"
+      ".inst 0x4f56f293  // bfdot v19.4s, v20.8h, v22.h[0]\n"
       "bge 91b\n"
       "92:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 94f\n"
@@ -1153,23 +1152,23 @@
       "ldr h1, [x25, #0x0]\n"
       "ldr h2, [x24, #0x0]\n"
       "93:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q21, [x10, #0x0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      ".inst 0x4f40f2a8  // bfdot v8.4s, v21.8h, v0.h[0]\n"
+      ".inst 0x4f41f2ac  // bfdot v12.4s, v21.8h, v1.h[0]\n"
+      ".inst 0x4f42f2b0  // bfdot v16.4s, v21.8h, v2.h[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      ".inst 0x4f40f289  // bfdot v9.4s, v20.8h, v0.h[0]\n"
+      ".inst 0x4f41f28d  // bfdot v13.4s, v20.8h, v1.h[0]\n"
+      ".inst 0x4f42f291  // bfdot v17.4s, v20.8h, v2.h[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f40f2aa  // bfdot v10.4s, v21.8h, v0.h[0]\n"
+      ".inst 0x4f41f2ae  // bfdot v14.4s, v21.8h, v1.h[0]\n"
+      ".inst 0x4f42f2b2  // bfdot v18.4s, v21.8h, v2.h[0]\n"
+      ".inst 0x4f40f28b  // bfdot v11.4s, v20.8h, v0.h[0]\n"
+      ".inst 0x4f41f28f  // bfdot v15.4s, v20.8h, v1.h[0]\n"
+      ".inst 0x4f42f293  // bfdot v19.4s, v20.8h, v2.h[0]\n"
       "94:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1183,33 +1182,33 @@
       "prfm pstl1keep, [x24, #0x0]\n"
       "tbz %x[flags], #1, 95f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v21.4s\n"
+      "fmin v9.4s, v9.4s, v21.4s\n"
+      "fmin v10.4s, v10.4s, v21.4s\n"
+      "fmin v11.4s, v11.4s, v21.4s\n"
+      "fmin v12.4s, v12.4s, v21.4s\n"
+      "fmin v13.4s, v13.4s, v21.4s\n"
+      "fmin v14.4s, v14.4s, v21.4s\n"
+      "fmin v15.4s, v15.4s, v21.4s\n"
+      "fmin v16.4s, v16.4s, v21.4s\n"
+      "fmin v17.4s, v17.4s, v21.4s\n"
+      "fmin v18.4s, v18.4s, v21.4s\n"
+      "fmin v19.4s, v19.4s, v21.4s\n"
+      "fmax v8.4s, v8.4s, v20.4s\n"
+      "fmax v9.4s, v9.4s, v20.4s\n"
+      "fmax v10.4s, v10.4s, v20.4s\n"
+      "fmax v11.4s, v11.4s, v20.4s\n"
+      "fmax v12.4s, v12.4s, v20.4s\n"
+      "fmax v13.4s, v13.4s, v20.4s\n"
+      "fmax v14.4s, v14.4s, v20.4s\n"
+      "fmax v15.4s, v15.4s, v20.4s\n"
+      "fmax v16.4s, v16.4s, v20.4s\n"
+      "fmax v17.4s, v17.4s, v20.4s\n"
+      "fmax v18.4s, v18.4s, v20.4s\n"
+      "fmax v19.4s, v19.4s, v20.4s\n"
       "95:"  // Height 3: No activation
       "cmp x11, #0x10\n"
       "bge 104f\n"
@@ -1485,14 +1484,14 @@
       "120:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 121f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 122f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1502,9 +1501,9 @@
       "b 122f\n"
       "121:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "122:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "blt 125f\n"
@@ -1523,7 +1522,7 @@
       "add x26, x26, #0x10\n"
       ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
       ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -1531,85 +1530,85 @@
       "add x23, x23, #0x10\n"
       ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
       ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f32a  // bfdot v10.4s, v25.8h, v0.h[0]\n"
+      ".inst 0x4f41f32e  // bfdot v14.4s, v25.8h, v1.h[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f42f332  // bfdot v18.4s, v25.8h, v2.h[0]\n"
+      ".inst 0x4f43f336  // bfdot v22.4s, v25.8h, v3.h[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f40f30b  // bfdot v11.4s, v24.8h, v0.h[0]\n"
+      ".inst 0x4f41f30f  // bfdot v15.4s, v24.8h, v1.h[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
-      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f42f313  // bfdot v19.4s, v24.8h, v2.h[0]\n"
+      ".inst 0x4f43f317  // bfdot v23.4s, v24.8h, v3.h[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      ".inst 0x4f60f328  // bfdot v8.4s, v25.8h, v0.h[1]\n"
+      ".inst 0x4f61f32c  // bfdot v12.4s, v25.8h, v1.h[1]\n"
+      ".inst 0x4f62f330  // bfdot v16.4s, v25.8h, v2.h[1]\n"
+      ".inst 0x4f63f334  // bfdot v20.4s, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      ".inst 0x4f60f309  // bfdot v9.4s, v24.8h, v0.h[1]\n"
+      ".inst 0x4f61f30d  // bfdot v13.4s, v24.8h, v1.h[1]\n"
+      ".inst 0x4f62f311  // bfdot v17.4s, v24.8h, v2.h[1]\n"
+      ".inst 0x4f63f315  // bfdot v21.4s, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      ".inst 0x4f60f32a  // bfdot v10.4s, v25.8h, v0.h[1]\n"
+      ".inst 0x4f61f32e  // bfdot v14.4s, v25.8h, v1.h[1]\n"
+      ".inst 0x4f62f332  // bfdot v18.4s, v25.8h, v2.h[1]\n"
+      ".inst 0x4f63f336  // bfdot v22.4s, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      ".inst 0x4f60f30b  // bfdot v11.4s, v24.8h, v0.h[1]\n"
+      ".inst 0x4f61f30f  // bfdot v15.4s, v24.8h, v1.h[1]\n"
+      ".inst 0x4f62f313  // bfdot v19.4s, v24.8h, v2.h[1]\n"
+      ".inst 0x4f63f317  // bfdot v23.4s, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      ".inst 0x4f40fb28  // bfdot v8.4s, v25.8h, v0.h[2]\n"
+      ".inst 0x4f41fb2c  // bfdot v12.4s, v25.8h, v1.h[2]\n"
+      ".inst 0x4f42fb30  // bfdot v16.4s, v25.8h, v2.h[2]\n"
+      ".inst 0x4f43fb34  // bfdot v20.4s, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      ".inst 0x4f40fb09  // bfdot v9.4s, v24.8h, v0.h[2]\n"
+      ".inst 0x4f41fb0d  // bfdot v13.4s, v24.8h, v1.h[2]\n"
+      ".inst 0x4f42fb11  // bfdot v17.4s, v24.8h, v2.h[2]\n"
+      ".inst 0x4f43fb15  // bfdot v21.4s, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      ".inst 0x4f40fb2a  // bfdot v10.4s, v25.8h, v0.h[2]\n"
+      ".inst 0x4f41fb2e  // bfdot v14.4s, v25.8h, v1.h[2]\n"
+      ".inst 0x4f42fb32  // bfdot v18.4s, v25.8h, v2.h[2]\n"
+      ".inst 0x4f43fb36  // bfdot v22.4s, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      ".inst 0x4f40fb0b  // bfdot v11.4s, v24.8h, v0.h[2]\n"
+      ".inst 0x4f41fb0f  // bfdot v15.4s, v24.8h, v1.h[2]\n"
+      ".inst 0x4f42fb13  // bfdot v19.4s, v24.8h, v2.h[2]\n"
+      ".inst 0x4f43fb17  // bfdot v23.4s, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      ".inst 0x4f60fb28  // bfdot v8.4s, v25.8h, v0.h[3]\n"
+      ".inst 0x4f61fb2c  // bfdot v12.4s, v25.8h, v1.h[3]\n"
+      ".inst 0x4f62fb30  // bfdot v16.4s, v25.8h, v2.h[3]\n"
+      ".inst 0x4f63fb34  // bfdot v20.4s, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      ".inst 0x4f60fb09  // bfdot v9.4s, v24.8h, v0.h[3]\n"
+      ".inst 0x4f61fb0d  // bfdot v13.4s, v24.8h, v1.h[3]\n"
+      ".inst 0x4f62fb11  // bfdot v17.4s, v24.8h, v2.h[3]\n"
+      ".inst 0x4f63fb15  // bfdot v21.4s, v24.8h, v3.h[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f60fb2a  // bfdot v10.4s, v25.8h, v0.h[3]\n"
+      ".inst 0x4f61fb2e  // bfdot v14.4s, v25.8h, v1.h[3]\n"
+      ".inst 0x4f62fb32  // bfdot v18.4s, v25.8h, v2.h[3]\n"
+      ".inst 0x4f63fb36  // bfdot v22.4s, v25.8h, v3.h[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f60fb0b  // bfdot v11.4s, v24.8h, v0.h[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f61fb0f  // bfdot v15.4s, v24.8h, v1.h[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f62fb13  // bfdot v19.4s, v24.8h, v2.h[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f63fb17  // bfdot v23.4s, v24.8h, v3.h[3]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 123b\n"
@@ -1620,7 +1619,7 @@
       "add x25, x25, #0x10\n"
       ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
       ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -1628,112 +1627,112 @@
       "sub x27, x27, #0x8\n"
       ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
       ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f32a  // bfdot v10.4s, v25.8h, v0.h[0]\n"
+      ".inst 0x4f41f32e  // bfdot v14.4s, v25.8h, v1.h[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f42f332  // bfdot v18.4s, v25.8h, v2.h[0]\n"
+      ".inst 0x4f43f336  // bfdot v22.4s, v25.8h, v3.h[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
-      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f40f30b  // bfdot v11.4s, v24.8h, v0.h[0]\n"
+      ".inst 0x4f41f30f  // bfdot v15.4s, v24.8h, v1.h[0]\n"
+      ".inst 0x4f42f313  // bfdot v19.4s, v24.8h, v2.h[0]\n"
+      ".inst 0x4f43f317  // bfdot v23.4s, v24.8h, v3.h[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      ".inst 0x4f60f328  // bfdot v8.4s, v25.8h, v0.h[1]\n"
+      ".inst 0x4f61f32c  // bfdot v12.4s, v25.8h, v1.h[1]\n"
+      ".inst 0x4f62f330  // bfdot v16.4s, v25.8h, v2.h[1]\n"
+      ".inst 0x4f63f334  // bfdot v20.4s, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      ".inst 0x4f60f309  // bfdot v9.4s, v24.8h, v0.h[1]\n"
+      ".inst 0x4f61f30d  // bfdot v13.4s, v24.8h, v1.h[1]\n"
+      ".inst 0x4f62f311  // bfdot v17.4s, v24.8h, v2.h[1]\n"
+      ".inst 0x4f63f315  // bfdot v21.4s, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      ".inst 0x4f60f32a  // bfdot v10.4s, v25.8h, v0.h[1]\n"
+      ".inst 0x4f61f32e  // bfdot v14.4s, v25.8h, v1.h[1]\n"
+      ".inst 0x4f62f332  // bfdot v18.4s, v25.8h, v2.h[1]\n"
+      ".inst 0x4f63f336  // bfdot v22.4s, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      ".inst 0x4f60f30b  // bfdot v11.4s, v24.8h, v0.h[1]\n"
+      ".inst 0x4f61f30f  // bfdot v15.4s, v24.8h, v1.h[1]\n"
+      ".inst 0x4f62f313  // bfdot v19.4s, v24.8h, v2.h[1]\n"
+      ".inst 0x4f63f317  // bfdot v23.4s, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      ".inst 0x4f40fb28  // bfdot v8.4s, v25.8h, v0.h[2]\n"
+      ".inst 0x4f41fb2c  // bfdot v12.4s, v25.8h, v1.h[2]\n"
+      ".inst 0x4f42fb30  // bfdot v16.4s, v25.8h, v2.h[2]\n"
+      ".inst 0x4f43fb34  // bfdot v20.4s, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      ".inst 0x4f40fb09  // bfdot v9.4s, v24.8h, v0.h[2]\n"
+      ".inst 0x4f41fb0d  // bfdot v13.4s, v24.8h, v1.h[2]\n"
+      ".inst 0x4f42fb11  // bfdot v17.4s, v24.8h, v2.h[2]\n"
+      ".inst 0x4f43fb15  // bfdot v21.4s, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      ".inst 0x4f40fb2a  // bfdot v10.4s, v25.8h, v0.h[2]\n"
+      ".inst 0x4f41fb2e  // bfdot v14.4s, v25.8h, v1.h[2]\n"
+      ".inst 0x4f42fb32  // bfdot v18.4s, v25.8h, v2.h[2]\n"
+      ".inst 0x4f43fb36  // bfdot v22.4s, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      ".inst 0x4f40fb0b  // bfdot v11.4s, v24.8h, v0.h[2]\n"
+      ".inst 0x4f41fb0f  // bfdot v15.4s, v24.8h, v1.h[2]\n"
+      ".inst 0x4f42fb13  // bfdot v19.4s, v24.8h, v2.h[2]\n"
+      ".inst 0x4f43fb17  // bfdot v23.4s, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      ".inst 0x4f60fb28  // bfdot v8.4s, v25.8h, v0.h[3]\n"
+      ".inst 0x4f61fb2c  // bfdot v12.4s, v25.8h, v1.h[3]\n"
+      ".inst 0x4f62fb30  // bfdot v16.4s, v25.8h, v2.h[3]\n"
+      ".inst 0x4f63fb34  // bfdot v20.4s, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      ".inst 0x4f60fb09  // bfdot v9.4s, v24.8h, v0.h[3]\n"
+      ".inst 0x4f61fb0d  // bfdot v13.4s, v24.8h, v1.h[3]\n"
+      ".inst 0x4f62fb11  // bfdot v17.4s, v24.8h, v2.h[3]\n"
+      ".inst 0x4f63fb15  // bfdot v21.4s, v24.8h, v3.h[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
-      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f60fb2a  // bfdot v10.4s, v25.8h, v0.h[3]\n"
+      ".inst 0x4f61fb2e  // bfdot v14.4s, v25.8h, v1.h[3]\n"
+      ".inst 0x4f62fb32  // bfdot v18.4s, v25.8h, v2.h[3]\n"
+      ".inst 0x4f63fb36  // bfdot v22.4s, v25.8h, v3.h[3]\n"
+      ".inst 0x4f60fb0b  // bfdot v11.4s, v24.8h, v0.h[3]\n"
+      ".inst 0x4f61fb0f  // bfdot v15.4s, v24.8h, v1.h[3]\n"
+      ".inst 0x4f62fb13  // bfdot v19.4s, v24.8h, v2.h[3]\n"
+      ".inst 0x4f63fb17  // bfdot v23.4s, v24.8h, v3.h[3]\n"
       "125:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 129f\n"
       "cmp x27, #0x2\n"
       "blt 127f\n"
       "126:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
       "sub x27, x27, #0x2\n"
       "cmp x27, #0x2\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      ".inst 0x4f5df328  // bfdot v8.4s, v25.8h, v29.h[0]\n"
+      ".inst 0x4f5cf32c  // bfdot v12.4s, v25.8h, v28.h[0]\n"
+      ".inst 0x4f5bf330  // bfdot v16.4s, v25.8h, v27.h[0]\n"
+      ".inst 0x4f5af334  // bfdot v20.4s, v25.8h, v26.h[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      ".inst 0x4f5df309  // bfdot v9.4s, v24.8h, v29.h[0]\n"
+      ".inst 0x4f5cf30d  // bfdot v13.4s, v24.8h, v28.h[0]\n"
+      ".inst 0x4f5bf311  // bfdot v17.4s, v24.8h, v27.h[0]\n"
+      ".inst 0x4f5af315  // bfdot v21.4s, v24.8h, v26.h[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f5df32a  // bfdot v10.4s, v25.8h, v29.h[0]\n"
+      ".inst 0x4f5cf32e  // bfdot v14.4s, v25.8h, v28.h[0]\n"
+      ".inst 0x4f5bf332  // bfdot v18.4s, v25.8h, v27.h[0]\n"
+      ".inst 0x4f5af336  // bfdot v22.4s, v25.8h, v26.h[0]\n"
+      ".inst 0x4f5df30b  // bfdot v11.4s, v24.8h, v29.h[0]\n"
+      ".inst 0x4f5cf30f  // bfdot v15.4s, v24.8h, v28.h[0]\n"
+      ".inst 0x4f5bf313  // bfdot v19.4s, v24.8h, v27.h[0]\n"
+      ".inst 0x4f5af317  // bfdot v23.4s, v24.8h, v26.h[0]\n"
       "bge 126b\n"
       "127:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 129f\n"
@@ -1742,27 +1741,27 @@
       "ldr h2, [x24, #0x0]\n"
       "ldr h3, [x23, #0x0]\n"
       "128:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      ".inst 0x4f40f328  // bfdot v8.4s, v25.8h, v0.h[0]\n"
+      ".inst 0x4f41f32c  // bfdot v12.4s, v25.8h, v1.h[0]\n"
+      ".inst 0x4f42f330  // bfdot v16.4s, v25.8h, v2.h[0]\n"
+      ".inst 0x4f43f334  // bfdot v20.4s, v25.8h, v3.h[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      ".inst 0x4f40f309  // bfdot v9.4s, v24.8h, v0.h[0]\n"
+      ".inst 0x4f41f30d  // bfdot v13.4s, v24.8h, v1.h[0]\n"
+      ".inst 0x4f42f311  // bfdot v17.4s, v24.8h, v2.h[0]\n"
+      ".inst 0x4f43f315  // bfdot v21.4s, v24.8h, v3.h[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f40f32a  // bfdot v10.4s, v25.8h, v0.h[0]\n"
+      ".inst 0x4f41f32e  // bfdot v14.4s, v25.8h, v1.h[0]\n"
+      ".inst 0x4f42f332  // bfdot v18.4s, v25.8h, v2.h[0]\n"
+      ".inst 0x4f43f336  // bfdot v22.4s, v25.8h, v3.h[0]\n"
+      ".inst 0x4f40f30b  // bfdot v11.4s, v24.8h, v0.h[0]\n"
+      ".inst 0x4f41f30f  // bfdot v15.4s, v24.8h, v1.h[0]\n"
+      ".inst 0x4f42f313  // bfdot v19.4s, v24.8h, v2.h[0]\n"
+      ".inst 0x4f43f317  // bfdot v23.4s, v24.8h, v3.h[0]\n"
       "129:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1778,41 +1777,41 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 130f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v25.4s\n"
+      "fmin v9.4s, v9.4s, v25.4s\n"
+      "fmin v10.4s, v10.4s, v25.4s\n"
+      "fmin v11.4s, v11.4s, v25.4s\n"
+      "fmin v12.4s, v12.4s, v25.4s\n"
+      "fmin v13.4s, v13.4s, v25.4s\n"
+      "fmin v14.4s, v14.4s, v25.4s\n"
+      "fmin v15.4s, v15.4s, v25.4s\n"
+      "fmin v16.4s, v16.4s, v25.4s\n"
+      "fmin v17.4s, v17.4s, v25.4s\n"
+      "fmin v18.4s, v18.4s, v25.4s\n"
+      "fmin v19.4s, v19.4s, v25.4s\n"
+      "fmin v20.4s, v20.4s, v25.4s\n"
+      "fmin v21.4s, v21.4s, v25.4s\n"
+      "fmin v22.4s, v22.4s, v25.4s\n"
+      "fmin v23.4s, v23.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v24.4s\n"
+      "fmax v9.4s, v9.4s, v24.4s\n"
+      "fmax v10.4s, v10.4s, v24.4s\n"
+      "fmax v11.4s, v11.4s, v24.4s\n"
+      "fmax v12.4s, v12.4s, v24.4s\n"
+      "fmax v13.4s, v13.4s, v24.4s\n"
+      "fmax v14.4s, v14.4s, v24.4s\n"
+      "fmax v15.4s, v15.4s, v24.4s\n"
+      "fmax v16.4s, v16.4s, v24.4s\n"
+      "fmax v17.4s, v17.4s, v24.4s\n"
+      "fmax v18.4s, v18.4s, v24.4s\n"
+      "fmax v19.4s, v19.4s, v24.4s\n"
+      "fmax v20.4s, v20.4s, v24.4s\n"
+      "fmax v21.4s, v21.4s, v24.4s\n"
+      "fmax v22.4s, v22.4s, v24.4s\n"
+      "fmax v23.4s, v23.4s, v24.4s\n"
       "130:"  // Height 4: No activation
       "cmp x11, #0x10\n"
       "bge 139f\n"
@@ -2137,15 +2136,15 @@
       "155:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 156f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 157f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2156,10 +2155,10 @@
       "b 157f\n"
       "156:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "157:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "blt 160f\n"
@@ -2182,7 +2181,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -2191,100 +2190,100 @@
       "cmp x27, #0x10\n"
       ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
       ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f3aa  // bfdot v10.4s, v29.8h, v0.h[0]\n"
+      ".inst 0x4f41f3ae  // bfdot v14.4s, v29.8h, v1.h[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f42f3b2  // bfdot v18.4s, v29.8h, v2.h[0]\n"
+      ".inst 0x4f43f3b6  // bfdot v22.4s, v29.8h, v3.h[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
-      ".inst 0x4f64f0d8  // bfdot v24.4s, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
-      ".inst 0x4f64f0f9  // bfdot v25.4s, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
-      ".inst 0x4f64f0da  // bfdot v26.4s, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
-      ".inst 0x4f64f0fb  // bfdot v27.4s, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
-      ".inst 0x4f44f8d8  // bfdot v24.4s, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
-      ".inst 0x4f44f8f9  // bfdot v25.4s, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
-      ".inst 0x4f44f8da  // bfdot v26.4s, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
-      ".inst 0x4f44f8fb  // bfdot v27.4s, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
-      ".inst 0x4f64f8d8  // bfdot v24.4s, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
-      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
-      ".inst 0x4f64f8f9  // bfdot v25.4s, v7.8h, v4.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f44f3ba  // bfdot v26.4s, v29.8h, v4.h[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      ".inst 0x4f40f38b  // bfdot v11.4s, v28.8h, v0.h[0]\n"
+      ".inst 0x4f41f38f  // bfdot v15.4s, v28.8h, v1.h[0]\n"
+      ".inst 0x4f42f393  // bfdot v19.4s, v28.8h, v2.h[0]\n"
+      ".inst 0x4f43f397  // bfdot v23.4s, v28.8h, v3.h[0]\n"
+      ".inst 0x4f44f39b  // bfdot v27.4s, v28.8h, v4.h[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      ".inst 0x4f60f3a8  // bfdot v8.4s, v29.8h, v0.h[1]\n"
+      ".inst 0x4f61f3ac  // bfdot v12.4s, v29.8h, v1.h[1]\n"
+      ".inst 0x4f62f3b0  // bfdot v16.4s, v29.8h, v2.h[1]\n"
+      ".inst 0x4f63f3b4  // bfdot v20.4s, v29.8h, v3.h[1]\n"
+      ".inst 0x4f64f3b8  // bfdot v24.4s, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      ".inst 0x4f60f389  // bfdot v9.4s, v28.8h, v0.h[1]\n"
+      ".inst 0x4f61f38d  // bfdot v13.4s, v28.8h, v1.h[1]\n"
+      ".inst 0x4f62f391  // bfdot v17.4s, v28.8h, v2.h[1]\n"
+      ".inst 0x4f63f395  // bfdot v21.4s, v28.8h, v3.h[1]\n"
+      ".inst 0x4f64f399  // bfdot v25.4s, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      ".inst 0x4f60f3aa  // bfdot v10.4s, v29.8h, v0.h[1]\n"
+      ".inst 0x4f61f3ae  // bfdot v14.4s, v29.8h, v1.h[1]\n"
+      ".inst 0x4f62f3b2  // bfdot v18.4s, v29.8h, v2.h[1]\n"
+      ".inst 0x4f63f3b6  // bfdot v22.4s, v29.8h, v3.h[1]\n"
+      ".inst 0x4f64f3ba  // bfdot v26.4s, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      ".inst 0x4f60f38b  // bfdot v11.4s, v28.8h, v0.h[1]\n"
+      ".inst 0x4f61f38f  // bfdot v15.4s, v28.8h, v1.h[1]\n"
+      ".inst 0x4f62f393  // bfdot v19.4s, v28.8h, v2.h[1]\n"
+      ".inst 0x4f63f397  // bfdot v23.4s, v28.8h, v3.h[1]\n"
+      ".inst 0x4f64f39b  // bfdot v27.4s, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      ".inst 0x4f40fba8  // bfdot v8.4s, v29.8h, v0.h[2]\n"
+      ".inst 0x4f41fbac  // bfdot v12.4s, v29.8h, v1.h[2]\n"
+      ".inst 0x4f42fbb0  // bfdot v16.4s, v29.8h, v2.h[2]\n"
+      ".inst 0x4f43fbb4  // bfdot v20.4s, v29.8h, v3.h[2]\n"
+      ".inst 0x4f44fbb8  // bfdot v24.4s, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      ".inst 0x4f40fb89  // bfdot v9.4s, v28.8h, v0.h[2]\n"
+      ".inst 0x4f41fb8d  // bfdot v13.4s, v28.8h, v1.h[2]\n"
+      ".inst 0x4f42fb91  // bfdot v17.4s, v28.8h, v2.h[2]\n"
+      ".inst 0x4f43fb95  // bfdot v21.4s, v28.8h, v3.h[2]\n"
+      ".inst 0x4f44fb99  // bfdot v25.4s, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      ".inst 0x4f40fbaa  // bfdot v10.4s, v29.8h, v0.h[2]\n"
+      ".inst 0x4f41fbae  // bfdot v14.4s, v29.8h, v1.h[2]\n"
+      ".inst 0x4f42fbb2  // bfdot v18.4s, v29.8h, v2.h[2]\n"
+      ".inst 0x4f43fbb6  // bfdot v22.4s, v29.8h, v3.h[2]\n"
+      ".inst 0x4f44fbba  // bfdot v26.4s, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      ".inst 0x4f40fb8b  // bfdot v11.4s, v28.8h, v0.h[2]\n"
+      ".inst 0x4f41fb8f  // bfdot v15.4s, v28.8h, v1.h[2]\n"
+      ".inst 0x4f42fb93  // bfdot v19.4s, v28.8h, v2.h[2]\n"
+      ".inst 0x4f43fb97  // bfdot v23.4s, v28.8h, v3.h[2]\n"
+      ".inst 0x4f44fb9b  // bfdot v27.4s, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      ".inst 0x4f60fba8  // bfdot v8.4s, v29.8h, v0.h[3]\n"
+      ".inst 0x4f61fbac  // bfdot v12.4s, v29.8h, v1.h[3]\n"
+      ".inst 0x4f62fbb0  // bfdot v16.4s, v29.8h, v2.h[3]\n"
+      ".inst 0x4f63fbb4  // bfdot v20.4s, v29.8h, v3.h[3]\n"
+      ".inst 0x4f64fbb8  // bfdot v24.4s, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      ".inst 0x4f60fb89  // bfdot v9.4s, v28.8h, v0.h[3]\n"
+      ".inst 0x4f61fb8d  // bfdot v13.4s, v28.8h, v1.h[3]\n"
+      ".inst 0x4f62fb91  // bfdot v17.4s, v28.8h, v2.h[3]\n"
+      ".inst 0x4f63fb95  // bfdot v21.4s, v28.8h, v3.h[3]\n"
+      ".inst 0x4f64fb99  // bfdot v25.4s, v28.8h, v4.h[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
-      ".inst 0x4f64f8da  // bfdot v26.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f60fbaa  // bfdot v10.4s, v29.8h, v0.h[3]\n"
+      ".inst 0x4f61fbae  // bfdot v14.4s, v29.8h, v1.h[3]\n"
+      ".inst 0x4f62fbb2  // bfdot v18.4s, v29.8h, v2.h[3]\n"
+      ".inst 0x4f63fbb6  // bfdot v22.4s, v29.8h, v3.h[3]\n"
+      ".inst 0x4f64fbba  // bfdot v26.4s, v29.8h, v4.h[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f60fb8b  // bfdot v11.4s, v28.8h, v0.h[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f61fb8f  // bfdot v15.4s, v28.8h, v1.h[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f62fb93  // bfdot v19.4s, v28.8h, v2.h[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f63fb97  // bfdot v23.4s, v28.8h, v3.h[3]\n"
       "ldr q3, [x23, #0x0]\n"
-      ".inst 0x4f64f8fb  // bfdot v27.4s, v7.8h, v4.h[3]\n"
+      ".inst 0x4f64fb9b  // bfdot v27.4s, v28.8h, v4.h[3]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 158b\n"
@@ -2298,7 +2297,7 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
@@ -2307,131 +2306,131 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
       ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f3aa  // bfdot v10.4s, v29.8h, v0.h[0]\n"
+      ".inst 0x4f41f3ae  // bfdot v14.4s, v29.8h, v1.h[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f42f3b2  // bfdot v18.4s, v29.8h, v2.h[0]\n"
+      ".inst 0x4f43f3b6  // bfdot v22.4s, v29.8h, v3.h[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
-      ".inst 0x4f64f0d8  // bfdot v24.4s, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
-      ".inst 0x4f64f0f9  // bfdot v25.4s, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
-      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
-      ".inst 0x4f64f0da  // bfdot v26.4s, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
-      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
-      ".inst 0x4f64f0fb  // bfdot v27.4s, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
-      ".inst 0x4f44f8d8  // bfdot v24.4s, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
-      ".inst 0x4f44f8f9  // bfdot v25.4s, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
-      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
-      ".inst 0x4f44f8da  // bfdot v26.4s, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
-      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
-      ".inst 0x4f44f8fb  // bfdot v27.4s, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
-      ".inst 0x4f64f8d8  // bfdot v24.4s, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
-      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
-      ".inst 0x4f64f8f9  // bfdot v25.4s, v7.8h, v4.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f44f3ba  // bfdot v26.4s, v29.8h, v4.h[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      ".inst 0x4f40f38b  // bfdot v11.4s, v28.8h, v0.h[0]\n"
+      ".inst 0x4f41f38f  // bfdot v15.4s, v28.8h, v1.h[0]\n"
+      ".inst 0x4f42f393  // bfdot v19.4s, v28.8h, v2.h[0]\n"
+      ".inst 0x4f43f397  // bfdot v23.4s, v28.8h, v3.h[0]\n"
+      ".inst 0x4f44f39b  // bfdot v27.4s, v28.8h, v4.h[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      ".inst 0x4f60f3a8  // bfdot v8.4s, v29.8h, v0.h[1]\n"
+      ".inst 0x4f61f3ac  // bfdot v12.4s, v29.8h, v1.h[1]\n"
+      ".inst 0x4f62f3b0  // bfdot v16.4s, v29.8h, v2.h[1]\n"
+      ".inst 0x4f63f3b4  // bfdot v20.4s, v29.8h, v3.h[1]\n"
+      ".inst 0x4f64f3b8  // bfdot v24.4s, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      ".inst 0x4f60f389  // bfdot v9.4s, v28.8h, v0.h[1]\n"
+      ".inst 0x4f61f38d  // bfdot v13.4s, v28.8h, v1.h[1]\n"
+      ".inst 0x4f62f391  // bfdot v17.4s, v28.8h, v2.h[1]\n"
+      ".inst 0x4f63f395  // bfdot v21.4s, v28.8h, v3.h[1]\n"
+      ".inst 0x4f64f399  // bfdot v25.4s, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      ".inst 0x4f60f3aa  // bfdot v10.4s, v29.8h, v0.h[1]\n"
+      ".inst 0x4f61f3ae  // bfdot v14.4s, v29.8h, v1.h[1]\n"
+      ".inst 0x4f62f3b2  // bfdot v18.4s, v29.8h, v2.h[1]\n"
+      ".inst 0x4f63f3b6  // bfdot v22.4s, v29.8h, v3.h[1]\n"
+      ".inst 0x4f64f3ba  // bfdot v26.4s, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      ".inst 0x4f60f38b  // bfdot v11.4s, v28.8h, v0.h[1]\n"
+      ".inst 0x4f61f38f  // bfdot v15.4s, v28.8h, v1.h[1]\n"
+      ".inst 0x4f62f393  // bfdot v19.4s, v28.8h, v2.h[1]\n"
+      ".inst 0x4f63f397  // bfdot v23.4s, v28.8h, v3.h[1]\n"
+      ".inst 0x4f64f39b  // bfdot v27.4s, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      ".inst 0x4f40fba8  // bfdot v8.4s, v29.8h, v0.h[2]\n"
+      ".inst 0x4f41fbac  // bfdot v12.4s, v29.8h, v1.h[2]\n"
+      ".inst 0x4f42fbb0  // bfdot v16.4s, v29.8h, v2.h[2]\n"
+      ".inst 0x4f43fbb4  // bfdot v20.4s, v29.8h, v3.h[2]\n"
+      ".inst 0x4f44fbb8  // bfdot v24.4s, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      ".inst 0x4f40fb89  // bfdot v9.4s, v28.8h, v0.h[2]\n"
+      ".inst 0x4f41fb8d  // bfdot v13.4s, v28.8h, v1.h[2]\n"
+      ".inst 0x4f42fb91  // bfdot v17.4s, v28.8h, v2.h[2]\n"
+      ".inst 0x4f43fb95  // bfdot v21.4s, v28.8h, v3.h[2]\n"
+      ".inst 0x4f44fb99  // bfdot v25.4s, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      ".inst 0x4f40fbaa  // bfdot v10.4s, v29.8h, v0.h[2]\n"
+      ".inst 0x4f41fbae  // bfdot v14.4s, v29.8h, v1.h[2]\n"
+      ".inst 0x4f42fbb2  // bfdot v18.4s, v29.8h, v2.h[2]\n"
+      ".inst 0x4f43fbb6  // bfdot v22.4s, v29.8h, v3.h[2]\n"
+      ".inst 0x4f44fbba  // bfdot v26.4s, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      ".inst 0x4f40fb8b  // bfdot v11.4s, v28.8h, v0.h[2]\n"
+      ".inst 0x4f41fb8f  // bfdot v15.4s, v28.8h, v1.h[2]\n"
+      ".inst 0x4f42fb93  // bfdot v19.4s, v28.8h, v2.h[2]\n"
+      ".inst 0x4f43fb97  // bfdot v23.4s, v28.8h, v3.h[2]\n"
+      ".inst 0x4f44fb9b  // bfdot v27.4s, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      ".inst 0x4f60fba8  // bfdot v8.4s, v29.8h, v0.h[3]\n"
+      ".inst 0x4f61fbac  // bfdot v12.4s, v29.8h, v1.h[3]\n"
+      ".inst 0x4f62fbb0  // bfdot v16.4s, v29.8h, v2.h[3]\n"
+      ".inst 0x4f63fbb4  // bfdot v20.4s, v29.8h, v3.h[3]\n"
+      ".inst 0x4f64fbb8  // bfdot v24.4s, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      ".inst 0x4f60fb89  // bfdot v9.4s, v28.8h, v0.h[3]\n"
+      ".inst 0x4f61fb8d  // bfdot v13.4s, v28.8h, v1.h[3]\n"
+      ".inst 0x4f62fb91  // bfdot v17.4s, v28.8h, v2.h[3]\n"
+      ".inst 0x4f63fb95  // bfdot v21.4s, v28.8h, v3.h[3]\n"
+      ".inst 0x4f64fb99  // bfdot v25.4s, v28.8h, v4.h[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
-      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
-      ".inst 0x4f64f8da  // bfdot v26.4s, v6.8h, v4.h[3]\n"
-      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
-      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
-      ".inst 0x4f64f8fb  // bfdot v27.4s, v7.8h, v4.h[3]\n"
+      ".inst 0x4f60fbaa  // bfdot v10.4s, v29.8h, v0.h[3]\n"
+      ".inst 0x4f61fbae  // bfdot v14.4s, v29.8h, v1.h[3]\n"
+      ".inst 0x4f62fbb2  // bfdot v18.4s, v29.8h, v2.h[3]\n"
+      ".inst 0x4f63fbb6  // bfdot v22.4s, v29.8h, v3.h[3]\n"
+      ".inst 0x4f64fbba  // bfdot v26.4s, v29.8h, v4.h[3]\n"
+      ".inst 0x4f60fb8b  // bfdot v11.4s, v28.8h, v0.h[3]\n"
+      ".inst 0x4f61fb8f  // bfdot v15.4s, v28.8h, v1.h[3]\n"
+      ".inst 0x4f62fb93  // bfdot v19.4s, v28.8h, v2.h[3]\n"
+      ".inst 0x4f63fb97  // bfdot v23.4s, v28.8h, v3.h[3]\n"
+      ".inst 0x4f64fb9b  // bfdot v27.4s, v28.8h, v4.h[3]\n"
       "160:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 164f\n"
       "cmp x27, #0x2\n"
       "blt 162f\n"
       "161:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
       "ldr s1, [x25], #0x4\n"
       "sub x27, x27, #0x2\n"
       "cmp x27, #0x2\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s0, [x24], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q29, [x10, #0x0]\n"
+      ".inst 0x4f42f3a8  // bfdot v8.4s, v29.8h, v2.h[0]\n"
+      ".inst 0x4f41f3ac  // bfdot v12.4s, v29.8h, v1.h[0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      ".inst 0x4f40f3b0  // bfdot v16.4s, v29.8h, v0.h[0]\n"
+      ".inst 0x4f5ff3b4  // bfdot v20.4s, v29.8h, v31.h[0]\n"
+      ".inst 0x4f5ef3b8  // bfdot v24.4s, v29.8h, v30.h[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      ".inst 0x4f42f389  // bfdot v9.4s, v28.8h, v2.h[0]\n"
+      ".inst 0x4f41f38d  // bfdot v13.4s, v28.8h, v1.h[0]\n"
+      ".inst 0x4f40f391  // bfdot v17.4s, v28.8h, v0.h[0]\n"
+      ".inst 0x4f5ff395  // bfdot v21.4s, v28.8h, v31.h[0]\n"
+      ".inst 0x4f5ef399  // bfdot v25.4s, v28.8h, v30.h[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f42f3aa  // bfdot v10.4s, v29.8h, v2.h[0]\n"
+      ".inst 0x4f41f3ae  // bfdot v14.4s, v29.8h, v1.h[0]\n"
+      ".inst 0x4f40f3b2  // bfdot v18.4s, v29.8h, v0.h[0]\n"
+      ".inst 0x4f5ff3b6  // bfdot v22.4s, v29.8h, v31.h[0]\n"
+      ".inst 0x4f5ef3ba  // bfdot v26.4s, v29.8h, v30.h[0]\n"
+      ".inst 0x4f42f38b  // bfdot v11.4s, v28.8h, v2.h[0]\n"
+      ".inst 0x4f41f38f  // bfdot v15.4s, v28.8h, v1.h[0]\n"
+      ".inst 0x4f40f393  // bfdot v19.4s, v28.8h, v0.h[0]\n"
+      ".inst 0x4f5ff397  // bfdot v23.4s, v28.8h, v31.h[0]\n"
+      ".inst 0x4f5ef39b  // bfdot v27.4s, v28.8h, v30.h[0]\n"
       "bge 161b\n"
       "162:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 164f\n"
@@ -2441,31 +2440,31 @@
       "ldr h3, [x23, #0x0]\n"
       "ldr h4, [x22, #0x0]\n"
       "163:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q29, [x10, #0x0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      ".inst 0x4f40f3a8  // bfdot v8.4s, v29.8h, v0.h[0]\n"
+      ".inst 0x4f41f3ac  // bfdot v12.4s, v29.8h, v1.h[0]\n"
+      ".inst 0x4f42f3b0  // bfdot v16.4s, v29.8h, v2.h[0]\n"
+      ".inst 0x4f43f3b4  // bfdot v20.4s, v29.8h, v3.h[0]\n"
+      ".inst 0x4f44f3b8  // bfdot v24.4s, v29.8h, v4.h[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      ".inst 0x4f40f389  // bfdot v9.4s, v28.8h, v0.h[0]\n"
+      ".inst 0x4f41f38d  // bfdot v13.4s, v28.8h, v1.h[0]\n"
+      ".inst 0x4f42f391  // bfdot v17.4s, v28.8h, v2.h[0]\n"
+      ".inst 0x4f43f395  // bfdot v21.4s, v28.8h, v3.h[0]\n"
+      ".inst 0x4f44f399  // bfdot v25.4s, v28.8h, v4.h[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f40f3aa  // bfdot v10.4s, v29.8h, v0.h[0]\n"
+      ".inst 0x4f41f3ae  // bfdot v14.4s, v29.8h, v1.h[0]\n"
+      ".inst 0x4f42f3b2  // bfdot v18.4s, v29.8h, v2.h[0]\n"
+      ".inst 0x4f43f3b6  // bfdot v22.4s, v29.8h, v3.h[0]\n"
+      ".inst 0x4f44f3ba  // bfdot v26.4s, v29.8h, v4.h[0]\n"
+      ".inst 0x4f40f38b  // bfdot v11.4s, v28.8h, v0.h[0]\n"
+      ".inst 0x4f41f38f  // bfdot v15.4s, v28.8h, v1.h[0]\n"
+      ".inst 0x4f42f393  // bfdot v19.4s, v28.8h, v2.h[0]\n"
+      ".inst 0x4f43f397  // bfdot v23.4s, v28.8h, v3.h[0]\n"
+      ".inst 0x4f44f39b  // bfdot v27.4s, v28.8h, v4.h[0]\n"
       "164:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2483,49 +2482,49 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 165f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmin v24.4s, v24.4s, v1.4s\n"
-      "fmin v25.4s, v25.4s, v1.4s\n"
-      "fmin v26.4s, v26.4s, v1.4s\n"
-      "fmin v27.4s, v27.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
-      "fmax v24.4s, v24.4s, v0.4s\n"
-      "fmax v25.4s, v25.4s, v0.4s\n"
-      "fmax v26.4s, v26.4s, v0.4s\n"
-      "fmax v27.4s, v27.4s, v0.4s\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v29.4s\n"
+      "fmin v9.4s, v9.4s, v29.4s\n"
+      "fmin v10.4s, v10.4s, v29.4s\n"
+      "fmin v11.4s, v11.4s, v29.4s\n"
+      "fmin v12.4s, v12.4s, v29.4s\n"
+      "fmin v13.4s, v13.4s, v29.4s\n"
+      "fmin v14.4s, v14.4s, v29.4s\n"
+      "fmin v15.4s, v15.4s, v29.4s\n"
+      "fmin v16.4s, v16.4s, v29.4s\n"
+      "fmin v17.4s, v17.4s, v29.4s\n"
+      "fmin v18.4s, v18.4s, v29.4s\n"
+      "fmin v19.4s, v19.4s, v29.4s\n"
+      "fmin v20.4s, v20.4s, v29.4s\n"
+      "fmin v21.4s, v21.4s, v29.4s\n"
+      "fmin v22.4s, v22.4s, v29.4s\n"
+      "fmin v23.4s, v23.4s, v29.4s\n"
+      "fmin v24.4s, v24.4s, v29.4s\n"
+      "fmin v25.4s, v25.4s, v29.4s\n"
+      "fmin v26.4s, v26.4s, v29.4s\n"
+      "fmin v27.4s, v27.4s, v29.4s\n"
+      "fmax v8.4s, v8.4s, v28.4s\n"
+      "fmax v9.4s, v9.4s, v28.4s\n"
+      "fmax v10.4s, v10.4s, v28.4s\n"
+      "fmax v11.4s, v11.4s, v28.4s\n"
+      "fmax v12.4s, v12.4s, v28.4s\n"
+      "fmax v13.4s, v13.4s, v28.4s\n"
+      "fmax v14.4s, v14.4s, v28.4s\n"
+      "fmax v15.4s, v15.4s, v28.4s\n"
+      "fmax v16.4s, v16.4s, v28.4s\n"
+      "fmax v17.4s, v17.4s, v28.4s\n"
+      "fmax v18.4s, v18.4s, v28.4s\n"
+      "fmax v19.4s, v19.4s, v28.4s\n"
+      "fmax v20.4s, v20.4s, v28.4s\n"
+      "fmax v21.4s, v21.4s, v28.4s\n"
+      "fmax v22.4s, v22.4s, v28.4s\n"
+      "fmax v23.4s, v23.4s, v28.4s\n"
+      "fmax v24.4s, v24.4s, v28.4s\n"
+      "fmax v25.4s, v25.4s, v28.4s\n"
+      "fmax v26.4s, v26.4s, v28.4s\n"
+      "fmax v27.4s, v27.4s, v28.4s\n"
       "165:"  // Height 5: No activation
       "cmp x11, #0x10\n"
       "bge 174f\n"
@@ -2902,16 +2901,16 @@
       "190:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 191f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 192f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2923,11 +2922,11 @@
       "b 192f\n"
       "191:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "192:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "blt 195f\n"
@@ -3206,43 +3205,43 @@
       "cmp x27, #0x2\n"
       "blt 197f\n"
       "196:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s7, [x26], #0x4\n"
+      "ldr s6, [x25], #0x4\n"
       "sub x27, x27, #0x2\n"
       "cmp x27, #0x2\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
-      ".inst 0x4f45f0dc  // bfdot v28.4s, v6.8h, v5.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
-      ".inst 0x4f45f0fd  // bfdot v29.4s, v7.8h, v5.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s4, [x23], #0x4\n"
+      "ldr s3, [x22], #0x4\n"
+      "ldr s2, [x21], #0x4\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x4f47f028  // bfdot v8.4s, v1.8h, v7.h[0]\n"
+      ".inst 0x4f46f02c  // bfdot v12.4s, v1.8h, v6.h[0]\n"
+      ".inst 0x4f45f030  // bfdot v16.4s, v1.8h, v5.h[0]\n"
+      ".inst 0x4f44f034  // bfdot v20.4s, v1.8h, v4.h[0]\n"
+      ".inst 0x4f43f038  // bfdot v24.4s, v1.8h, v3.h[0]\n"
+      ".inst 0x4f42f03c  // bfdot v28.4s, v1.8h, v2.h[0]\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x4f47f009  // bfdot v9.4s, v0.8h, v7.h[0]\n"
+      ".inst 0x4f46f00d  // bfdot v13.4s, v0.8h, v6.h[0]\n"
+      ".inst 0x4f45f011  // bfdot v17.4s, v0.8h, v5.h[0]\n"
+      ".inst 0x4f44f015  // bfdot v21.4s, v0.8h, v4.h[0]\n"
+      ".inst 0x4f43f019  // bfdot v25.4s, v0.8h, v3.h[0]\n"
+      ".inst 0x4f42f01d  // bfdot v29.4s, v0.8h, v2.h[0]\n"
+      "ldr q0, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
-      ".inst 0x4f45f0de  // bfdot v30.4s, v6.8h, v5.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
-      ".inst 0x4f45f0ff  // bfdot v31.4s, v7.8h, v5.h[0]\n"
+      ".inst 0x4f47f02a  // bfdot v10.4s, v1.8h, v7.h[0]\n"
+      ".inst 0x4f46f02e  // bfdot v14.4s, v1.8h, v6.h[0]\n"
+      ".inst 0x4f45f032  // bfdot v18.4s, v1.8h, v5.h[0]\n"
+      ".inst 0x4f44f036  // bfdot v22.4s, v1.8h, v4.h[0]\n"
+      ".inst 0x4f43f03a  // bfdot v26.4s, v1.8h, v3.h[0]\n"
+      ".inst 0x4f42f03e  // bfdot v30.4s, v1.8h, v2.h[0]\n"
+      ".inst 0x4f47f00b  // bfdot v11.4s, v0.8h, v7.h[0]\n"
+      ".inst 0x4f46f00f  // bfdot v15.4s, v0.8h, v6.h[0]\n"
+      ".inst 0x4f45f013  // bfdot v19.4s, v0.8h, v5.h[0]\n"
+      ".inst 0x4f44f017  // bfdot v23.4s, v0.8h, v4.h[0]\n"
+      ".inst 0x4f43f01b  // bfdot v27.4s, v0.8h, v3.h[0]\n"
+      ".inst 0x4f42f01f  // bfdot v31.4s, v0.8h, v2.h[0]\n"
       "bge 196b\n"
       "197:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 199f\n"
@@ -3253,35 +3252,35 @@
       "ldr h4, [x22, #0x0]\n"
       "ldr h5, [x21, #0x0]\n"
       "198:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
-      ".inst 0x4f45f0dc  // bfdot v28.4s, v6.8h, v5.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
-      ".inst 0x4f45f0fd  // bfdot v29.4s, v7.8h, v5.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q7, [x10, #0x0]\n"
+      "ldr q6, [x10, #0x10]\n"
+      ".inst 0x4f40f0e8  // bfdot v8.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ec  // bfdot v12.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f0  // bfdot v16.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f4  // bfdot v20.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f8  // bfdot v24.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0fc  // bfdot v28.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x10, #0x20]\n"
+      ".inst 0x4f40f0c9  // bfdot v9.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0cd  // bfdot v13.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d1  // bfdot v17.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d5  // bfdot v21.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0d9  // bfdot v25.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0dd  // bfdot v29.4s, v6.8h, v5.h[0]\n"
+      "ldr q6, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
-      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
-      ".inst 0x4f45f0de  // bfdot v30.4s, v6.8h, v5.h[0]\n"
-      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
-      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
-      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
-      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
-      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
-      ".inst 0x4f45f0ff  // bfdot v31.4s, v7.8h, v5.h[0]\n"
+      ".inst 0x4f40f0ea  // bfdot v10.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ee  // bfdot v14.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f2  // bfdot v18.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f6  // bfdot v22.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fa  // bfdot v26.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0fe  // bfdot v30.4s, v7.8h, v5.h[0]\n"
+      ".inst 0x4f40f0cb  // bfdot v11.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0cf  // bfdot v15.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d3  // bfdot v19.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d7  // bfdot v23.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0db  // bfdot v27.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0df  // bfdot v31.4s, v6.8h, v5.h[0]\n"
       "199:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3522,7 +3521,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "212:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
index 8cb743b..d9e7259 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
index 5a000c6..f6389e2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -93,7 +93,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 186f\n"
@@ -211,11 +210,11 @@
       "16:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -231,41 +230,41 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 20f\n"
       "19:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "trn1 v20.2d, v1.2d, v21.2d\n"
+      ".inst 0x6e47ee88  // bfmmla v8.4s, v20.8h, v7.8h\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6e46ee8c  // bfmmla v12.4s, v20.8h, v6.8h\n"
+      "ldr q19, [x10, #0x30]\n"
+      ".inst 0x6e51ee89  // bfmmla v9.4s, v20.8h, v17.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e53ee8d  // bfmmla v13.4s, v20.8h, v19.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ee8a  // bfmmla v10.4s, v20.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ee8e  // bfmmla v14.4s, v20.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v21.2d\n"
+      ".inst 0x6e52ee8b  // bfmmla v11.4s, v20.8h, v18.8h\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x6e51ee8f  // bfmmla v15.4s, v20.8h, v17.8h\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xf0]\n"
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "ldr q1, [x26, #0x0]\n"
       "add x10, x10, #0x100\n"
       "ldr q7, [x10, #0x0]\n"
@@ -273,40 +272,40 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       "bge 19b\n"
       "20:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "trn1 v19.2d, v1.2d, v20.2d\n"
+      ".inst 0x6e47ee68  // bfmmla v8.4s, v19.8h, v7.8h\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6e46ee6c  // bfmmla v12.4s, v19.8h, v6.8h\n"
+      "ldr q18, [x10, #0x30]\n"
+      ".inst 0x6e51ee69  // bfmmla v9.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x6e52ee6d  // bfmmla v13.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x50]\n"
+      ".inst 0x6e51ee6a  // bfmmla v10.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x6e52ee6e  // bfmmla v14.4s, v19.8h, v18.8h\n"
+      "ldr q24, [x10, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v20.2d\n"
+      ".inst 0x6e51ee6b  // bfmmla v11.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x6e58ee6f  // bfmmla v15.4s, v19.8h, v24.8h\n"
+      "ldr q2, [x10, #0x90]\n"
+      ".inst 0x6e51ec28  // bfmmla v8.4s, v1.8h, v17.8h\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e42ec2c  // bfmmla v12.4s, v1.8h, v2.8h\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "21:"  // Height 1: Multiply loop: Main loop skip
@@ -314,26 +313,26 @@
       "cmp x27, #0x4\n"
       "blt 23f\n"
       "22:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr d19, [x26], #0x8\n"
+      "ldr q18, [x10, #0x0]\n"
+      "trn1 v19.2d, v19.2d, v17.2d\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee68  // bfmmla v8.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ee6c  // bfmmla v12.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x10, x10, #0x80\n"
       "bge 22b\n"
       "23:"  // Height 1: Multiply loop: Skip odd blocks
@@ -346,23 +345,23 @@
       "24:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr h1, [x26, #0x0]\n"
       "25:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      "ldr q20, [x10, #0x0]\n"
+      "ldr q18, [x10, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v17.2d\n"
+      ".inst 0x6e54ee68  // bfmmla v8.4s, v19.8h, v20.8h\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6e52ee6c  // bfmmla v12.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x30]\n"
+      ".inst 0x6e51ee69  // bfmmla v9.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x6e52ee6d  // bfmmla v13.4s, v19.8h, v18.8h\n"
+      "ldr q2, [x10, #0x50]\n"
+      ".inst 0x6e51ee6a  // bfmmla v10.4s, v19.8h, v17.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e42ee6e  // bfmmla v14.4s, v19.8h, v2.8h\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x10, x10, #0x80\n"
       "26:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -376,17 +375,17 @@
       "uzp1 v11.2d, v11.2d, v15.2d\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v18.4s\n"
+      "fmin v9.4s, v9.4s, v18.4s\n"
+      "fmin v10.4s, v10.4s, v18.4s\n"
+      "fmin v11.4s, v11.4s, v18.4s\n"
+      "fmax v8.4s, v8.4s, v17.4s\n"
+      "fmax v9.4s, v9.4s, v17.4s\n"
+      "fmax v10.4s, v10.4s, v17.4s\n"
+      "fmax v11.4s, v11.4s, v17.4s\n"
       "27:"  // Height 1: No activation
       "cmp x11, #0x10\n"
       "bge 36f\n"
@@ -577,12 +576,12 @@
       "53:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 54f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 55f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -590,7 +589,7 @@
       "b 55f\n"
       "54:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "55:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "blt 58f\n"
@@ -601,85 +600,85 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 57f\n"
       "56:"  // Height 2: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e47ee68  // bfmmla v8.4s, v19.8h, v7.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e46ee6c  // bfmmla v12.4s, v19.8h, v6.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xf0]\n"
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "ldr q2, [x25, #0x0]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
       "add x10, x10, #0x100\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "ldr q1, [x26, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "bge 56b\n"
       "57:"  // Height 2: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e47ee68  // bfmmla v8.4s, v19.8h, v7.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e46ee6c  // bfmmla v12.4s, v19.8h, v6.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x6e52ec28  // bfmmla v8.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e51ec2c  // bfmmla v12.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e52ec29  // bfmmla v9.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e51ec2d  // bfmmla v13.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e52ec2a  // bfmmla v10.4s, v1.8h, v18.8h\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e51ec2e  // bfmmla v14.4s, v1.8h, v17.8h\n"
+      "ldr q17, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e52ec2b  // bfmmla v11.4s, v1.8h, v18.8h\n"
+      ".inst 0x6e51ec2f  // bfmmla v15.4s, v1.8h, v17.8h\n"
       "sub x27, x27, #0x8\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
@@ -689,27 +688,27 @@
       "cmp x27, #0x4\n"
       "blt 60f\n"
       "59:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d18, [x26], #0x8\n"
+      "ldr d17, [x25], #0x8\n"
+      "trn1 v19.2d, v18.2d, v17.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e52ee68  // bfmmla v8.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6c  // bfmmla v12.4s, v19.8h, v17.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      "ldr q5, [x10, #0x30]\n"
+      ".inst 0x6e5aee69  // bfmmla v9.4s, v19.8h, v26.8h\n"
+      ".inst 0x6e45ee6d  // bfmmla v13.4s, v19.8h, v5.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ee6a  // bfmmla v10.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6e  // bfmmla v14.4s, v19.8h, v17.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      "ldr q17, [x10, #0x70]\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x10, x10, #0x80\n"
       "bge 59b\n"
       "60:"  // Height 2: Multiply loop: Skip odd blocks
@@ -725,23 +724,23 @@
       "ldr h1, [x26, #0x0]\n"
       "ldr h2, [x25, #0x0]\n"
       "62:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e52ee68  // bfmmla v8.4s, v19.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ee6c  // bfmmla v12.4s, v19.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ee69  // bfmmla v9.4s, v19.8h, v18.8h\n"
+      "ldr q30, [x10, #0x40]\n"
+      ".inst 0x6e51ee6d  // bfmmla v13.4s, v19.8h, v17.8h\n"
+      "ldr q26, [x10, #0x50]\n"
+      ".inst 0x6e5eee6a  // bfmmla v10.4s, v19.8h, v30.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e5aee6e  // bfmmla v14.4s, v19.8h, v26.8h\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x6e52ee6b  // bfmmla v11.4s, v19.8h, v18.8h\n"
+      ".inst 0x6e51ee6f  // bfmmla v15.4s, v19.8h, v17.8h\n"
       "add x10, x10, #0x80\n"
       "63:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -762,25 +761,25 @@
       "uzp2 v11.2d, v11.2d, v15.2d\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v7.4s, v7.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v7.4s, v7.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "fmin v7.4s, v7.4s, v18.4s\n"
+      "fmin v12.4s, v12.4s, v18.4s\n"
+      "fmin v13.4s, v13.4s, v18.4s\n"
+      "fmin v14.4s, v14.4s, v18.4s\n"
+      "fmin v8.4s, v8.4s, v18.4s\n"
+      "fmin v9.4s, v9.4s, v18.4s\n"
+      "fmin v10.4s, v10.4s, v18.4s\n"
+      "fmin v11.4s, v11.4s, v18.4s\n"
+      "fmax v7.4s, v7.4s, v17.4s\n"
+      "fmax v12.4s, v12.4s, v17.4s\n"
+      "fmax v13.4s, v13.4s, v17.4s\n"
+      "fmax v14.4s, v14.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v17.4s\n"
+      "fmax v9.4s, v9.4s, v17.4s\n"
+      "fmax v10.4s, v10.4s, v17.4s\n"
+      "fmax v11.4s, v11.4s, v17.4s\n"
       "64:"  // Height 2: No activation
       "cmp x11, #0x10\n"
       "bge 73f\n"
@@ -1036,13 +1035,13 @@
       "90:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 91f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 92f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1051,8 +1050,8 @@
       "b 92f\n"
       "91:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "92:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "blt 95f\n"
@@ -1064,167 +1063,167 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 94f\n"
       "93:"  // Height 3: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "bge 93b\n"
       "94:"  // Height 3: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x90]\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "95:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 100f\n"
       "cmp x27, #0x4\n"
       "blt 97f\n"
       "96:"  // Height 3: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr q26, [x10, #0x0]\n"
+      "trn1 v27.2d, v25.2d, v27.2d\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ef8c  // bfmmla v12.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef74  // bfmmla v20.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
       "sub x27, x27, #0x4\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "bge 96b\n"
       "97:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 100f\n"
@@ -1242,33 +1241,33 @@
       "ldr h2, [x25, #0x0]\n"
       "ldr h3, [x24, #0x0]\n"
       "99:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v25.2d\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e5def8c  // bfmmla v12.4s, v28.8h, v29.8h\n"
+      ".inst 0x6e5def74  // bfmmla v20.4s, v27.8h, v29.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "100:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1294,33 +1293,33 @@
       "uzp1 v19.2d, v19.2d, v23.2d\n"
       "tbz %x[flags], #1, 101f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v7.4s, v7.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v7.4s, v7.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v25.4s }, [x20]\n"
+      "fmin v7.4s, v7.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmax v7.4s, v7.4s, v25.4s\n"
+      "fmax v12.4s, v12.4s, v25.4s\n"
+      "fmax v13.4s, v13.4s, v25.4s\n"
+      "fmax v14.4s, v14.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v25.4s\n"
+      "fmax v9.4s, v9.4s, v25.4s\n"
+      "fmax v10.4s, v10.4s, v25.4s\n"
+      "fmax v11.4s, v11.4s, v25.4s\n"
+      "fmax v16.4s, v16.4s, v25.4s\n"
+      "fmax v17.4s, v17.4s, v25.4s\n"
+      "fmax v18.4s, v18.4s, v25.4s\n"
+      "fmax v19.4s, v19.4s, v25.4s\n"
       "101:"  // Height 3: No activation
       "cmp x11, #0x10\n"
       "bge 110f\n"
@@ -1617,14 +1616,14 @@
       "127:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 128f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 129f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1634,9 +1633,9 @@
       "b 129f\n"
       "128:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "129:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "blt 132f\n"
@@ -1649,173 +1648,173 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 131f\n"
       "130:"  // Height 4: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
       "sub x27, x27, #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "add x23, x23, #0x10\n"
       "ldr q4, [x23, #0x0]\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
       "cmp x27, #0x10\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "bge 130b\n"
       "131:"  // Height 4: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ef88  // bfmmla v8.4s, v28.8h, v7.8h\n"
       "add x26, x26, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e47ef70  // bfmmla v16.4s, v27.8h, v7.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e46ef8c  // bfmmla v12.4s, v28.8h, v6.8h\n"
+      ".inst 0x6e46ef74  // bfmmla v20.4s, v27.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x90]\n"
+      ".inst 0x6e5aec28  // bfmmla v8.4s, v1.8h, v26.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e5aec70  // bfmmla v16.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e59ec2c  // bfmmla v12.4s, v1.8h, v25.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e59ec74  // bfmmla v20.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e5aec29  // bfmmla v9.4s, v1.8h, v26.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e5aec71  // bfmmla v17.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e59ec2d  // bfmmla v13.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec75  // bfmmla v21.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e5aec2a  // bfmmla v10.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec72  // bfmmla v18.4s, v3.8h, v26.8h\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e59ec2e  // bfmmla v14.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec76  // bfmmla v22.4s, v3.8h, v25.8h\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e5aec2b  // bfmmla v11.4s, v1.8h, v26.8h\n"
+      ".inst 0x6e5aec73  // bfmmla v19.4s, v3.8h, v26.8h\n"
+      ".inst 0x6e59ec2f  // bfmmla v15.4s, v1.8h, v25.8h\n"
+      ".inst 0x6e59ec77  // bfmmla v23.4s, v3.8h, v25.8h\n"
       "132:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 137f\n"
       "cmp x27, #0x4\n"
       "blt 134f\n"
       "133:"  // Height 4: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "trn1 v27.2d, v26.2d, v25.2d\n"
       "cmp x27, #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ef8c  // bfmmla v12.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef74  // bfmmla v20.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "bge 133b\n"
       "134:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 137f\n"
@@ -1836,33 +1835,33 @@
       "ldr h3, [x24, #0x0]\n"
       "ldr h4, [x23, #0x0]\n"
       "136:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e5aef88  // bfmmla v8.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef70  // bfmmla v16.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ef8c  // bfmmla v12.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef74  // bfmmla v20.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aef89  // bfmmla v9.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef71  // bfmmla v17.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ef8d  // bfmmla v13.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef75  // bfmmla v21.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aef8a  // bfmmla v10.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef72  // bfmmla v18.4s, v27.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ef8e  // bfmmla v14.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef76  // bfmmla v22.4s, v27.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e5aef8b  // bfmmla v11.4s, v28.8h, v26.8h\n"
+      ".inst 0x6e5aef73  // bfmmla v19.4s, v27.8h, v26.8h\n"
+      ".inst 0x6e59ef8f  // bfmmla v15.4s, v28.8h, v25.8h\n"
+      ".inst 0x6e59ef77  // bfmmla v23.4s, v27.8h, v25.8h\n"
       "137:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1894,41 +1893,41 @@
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "tbz %x[flags], #1, 138f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v7.4s, v7.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v7.4s, v7.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v25.4s }, [x20]\n"
+      "fmin v7.4s, v7.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v15.4s, v15.4s, v26.4s\n"
+      "fmin v20.4s, v20.4s, v26.4s\n"
+      "fmin v21.4s, v21.4s, v26.4s\n"
+      "fmin v22.4s, v22.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmax v7.4s, v7.4s, v25.4s\n"
+      "fmax v12.4s, v12.4s, v25.4s\n"
+      "fmax v13.4s, v13.4s, v25.4s\n"
+      "fmax v14.4s, v14.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v25.4s\n"
+      "fmax v9.4s, v9.4s, v25.4s\n"
+      "fmax v10.4s, v10.4s, v25.4s\n"
+      "fmax v11.4s, v11.4s, v25.4s\n"
+      "fmax v15.4s, v15.4s, v25.4s\n"
+      "fmax v20.4s, v20.4s, v25.4s\n"
+      "fmax v21.4s, v21.4s, v25.4s\n"
+      "fmax v22.4s, v22.4s, v25.4s\n"
+      "fmax v16.4s, v16.4s, v25.4s\n"
+      "fmax v17.4s, v17.4s, v25.4s\n"
+      "fmax v18.4s, v18.4s, v25.4s\n"
+      "fmax v19.4s, v19.4s, v25.4s\n"
       "138:"  // Height 4: No activation
       "cmp x11, #0x10\n"
       "bge 147f\n"
@@ -2290,15 +2289,15 @@
       "164:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 165f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 166f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2309,10 +2308,10 @@
       "b 166f\n"
       "165:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "166:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "blt 169f\n"
@@ -2325,174 +2324,174 @@
       "ldr q7, [x10, #0x0]\n"
       "blt 168f\n"
       "167:"  // Height 5: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ecc8  // bfmmla v8.4s, v6.8h, v7.8h\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
       "sub x27, x27, #0x8\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x10, #0x10]\n"
       ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccc  // bfmmla v12.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec54  // bfmmla v20.4s, v2.8h, v0.8h\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9c  // bfmmla v28.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e47ecc9  // bfmmla v9.4s, v6.8h, v7.8h\n"
       "add x25, x25, #0x10\n"
       ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x40]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccd  // bfmmla v13.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec55  // bfmmla v21.4s, v2.8h, v0.8h\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e47ecca  // bfmmla v10.4s, v6.8h, v7.8h\n"
       "cmp x27, #0x10\n"
       ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40ecce  // bfmmla v14.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec56  // bfmmla v22.4s, v2.8h, v0.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9e  // bfmmla v30.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x6e47eccb  // bfmmla v11.4s, v6.8h, v7.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccf  // bfmmla v15.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec57  // bfmmla v23.4s, v2.8h, v0.8h\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "ldr q6, [x10, #0xa0]\n"
+      ".inst 0x6e40ec2c  // bfmmla v12.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbc  // bfmmla v28.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xb0]\n"
+      ".inst 0x6e46ec29  // bfmmla v9.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec71  // bfmmla v17.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecb9  // bfmmla v25.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x10, #0xc0]\n"
+      ".inst 0x6e40ec2d  // bfmmla v13.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbd  // bfmmla v29.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xd0]\n"
+      ".inst 0x6e46ec2a  // bfmmla v10.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec72  // bfmmla v18.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecba  // bfmmla v26.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x10, #0xe0]\n"
+      ".inst 0x6e40ec2e  // bfmmla v14.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbe  // bfmmla v30.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e46ec2b  // bfmmla v11.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecbb  // bfmmla v27.4s, v5.8h, v6.8h\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e40ec2f  // bfmmla v15.4s, v1.8h, v0.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
+      ".inst 0x6e40ecbf  // bfmmla v31.4s, v5.8h, v0.8h\n"
       "ldr q5, [x22, #0x0]\n"
       "bge 167b\n"
       "168:"  // Height 5: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ecc8  // bfmmla v8.4s, v6.8h, v7.8h\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
       "add x26, x26, #0x10\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x10, #0x10]\n"
       ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccc  // bfmmla v12.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec54  // bfmmla v20.4s, v2.8h, v0.8h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9c  // bfmmla v28.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e47ecc9  // bfmmla v9.4s, v6.8h, v7.8h\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x40]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40eccd  // bfmmla v13.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec55  // bfmmla v21.4s, v2.8h, v0.8h\n"
       "add x22, x22, #0x10\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e47ecca  // bfmmla v10.4s, v6.8h, v7.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40ecce  // bfmmla v14.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec56  // bfmmla v22.4s, v2.8h, v0.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40ec9e  // bfmmla v30.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x6e47eccb  // bfmmla v11.4s, v6.8h, v7.8h\n"
       "prfm pldl1keep, [x22, #0x80]\n"
       ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
       "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e40eccf  // bfmmla v15.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e40ec57  // bfmmla v23.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
+      "ldr q2, [x10, #0x90]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
+      "ldr q0, [x10, #0xa0]\n"
+      ".inst 0x6e42ec2c  // bfmmla v12.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec74  // bfmmla v20.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbc  // bfmmla v28.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x10, #0xb0]\n"
+      ".inst 0x6e40ec29  // bfmmla v9.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xc0]\n"
+      ".inst 0x6e42ec2d  // bfmmla v13.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec75  // bfmmla v21.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbd  // bfmmla v29.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x10, #0xd0]\n"
+      ".inst 0x6e40ec2a  // bfmmla v10.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecba  // bfmmla v26.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xe0]\n"
+      ".inst 0x6e42ec2e  // bfmmla v14.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec76  // bfmmla v22.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbe  // bfmmla v30.4s, v5.8h, v2.8h\n"
       "ldr q6, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e40ec2b  // bfmmla v11.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbb  // bfmmla v27.4s, v5.8h, v0.8h\n"
       ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
       ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
       ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -2502,48 +2501,48 @@
       "blt 171f\n"
       "170:"  // Height 5: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
+      "ldr d0, [x22], #0x8\n"
+      "ldr q1, [x10, #0x0]\n"
+      "trn1 v2.2d, v0.2d, v2.2d\n"
+      ".inst 0x6e41ec88  // bfmmla v8.4s, v4.8h, v1.8h\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e41ec70  // bfmmla v16.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec58  // bfmmla v24.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x6e40ec8c  // bfmmla v12.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40ec5c  // bfmmla v28.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e41ec89  // bfmmla v9.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec71  // bfmmla v17.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec59  // bfmmla v25.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x40]\n"
+      ".inst 0x6e40ec8d  // bfmmla v13.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5d  // bfmmla v29.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e41ec8a  // bfmmla v10.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec72  // bfmmla v18.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5a  // bfmmla v26.4s, v2.8h, v1.8h\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e40ec8e  // bfmmla v14.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5e  // bfmmla v30.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x6e46ec8b  // bfmmla v11.4s, v4.8h, v6.8h\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5b  // bfmmla v27.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40ec8f  // bfmmla v15.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5f  // bfmmla v31.4s, v2.8h, v0.8h\n"
       "bge 170b\n"
       "171:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 174f\n"
@@ -2567,42 +2566,42 @@
       "ldr h4, [x23, #0x0]\n"
       "ldr h5, [x22, #0x0]\n"
       "173:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
+      "ldr q6, [x10, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      "trn1 v2.2d, v5.2d, v0.2d\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x6e46ece8  // bfmmla v8.4s, v7.8h, v6.8h\n"
+      ".inst 0x6e46ec70  // bfmmla v16.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec58  // bfmmla v24.4s, v2.8h, v6.8h\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x6e41ecec  // bfmmla v12.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec74  // bfmmla v20.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5c  // bfmmla v28.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e40ece9  // bfmmla v9.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec59  // bfmmla v25.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x40]\n"
+      ".inst 0x6e41eced  // bfmmla v13.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec75  // bfmmla v21.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e40ecea  // bfmmla v10.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5a  // bfmmla v26.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x60]\n"
+      ".inst 0x6e41ecee  // bfmmla v14.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec76  // bfmmla v22.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5e  // bfmmla v30.4s, v2.8h, v1.8h\n"
       "ldr q6, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5b  // bfmmla v27.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e46ecef  // bfmmla v15.4s, v7.8h, v6.8h\n"
+      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5f  // bfmmla v31.4s, v2.8h, v6.8h\n"
       "174:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3088,16 +3087,16 @@
       "201:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 202f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 203f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -3109,11 +3108,11 @@
       "b 203f\n"
       "202:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "203:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "blt 206f\n"
@@ -3180,42 +3179,42 @@
       "ldr q2, [x25, #0x0]\n"
       "prfm pldl1keep, [x21, #0x80]\n"
       ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
+      "ldr q0, [x10, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "ldr q6, [x10, #0xa0]\n"
+      ".inst 0x6e40ec2c  // bfmmla v12.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbc  // bfmmla v28.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xb0]\n"
+      ".inst 0x6e46ec29  // bfmmla v9.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec71  // bfmmla v17.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecb9  // bfmmla v25.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x10, #0xc0]\n"
+      ".inst 0x6e40ec2d  // bfmmla v13.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbd  // bfmmla v29.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xd0]\n"
+      ".inst 0x6e46ec2a  // bfmmla v10.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec72  // bfmmla v18.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecba  // bfmmla v26.4s, v5.8h, v6.8h\n"
+      "ldr q6, [x10, #0xe0]\n"
+      ".inst 0x6e40ec2e  // bfmmla v14.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbe  // bfmmla v30.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e46ec2b  // bfmmla v11.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ecbb  // bfmmla v27.4s, v5.8h, v6.8h\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e40ec2f  // bfmmla v15.4s, v1.8h, v0.8h\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
+      ".inst 0x6e40ecbf  // bfmmla v31.4s, v5.8h, v0.8h\n"
       "ldr q5, [x22, #0x0]\n"
       "ldr q6, [x21, #0x0]\n"
       "bge 204b\n"
@@ -3271,35 +3270,35 @@
       ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x90]\n"
+      "ldr q2, [x10, #0x90]\n"
       ".inst 0x6e47ec28  // bfmmla v8.4s, v1.8h, v7.8h\n"
       ".inst 0x6e47ec70  // bfmmla v16.4s, v3.8h, v7.8h\n"
       ".inst 0x6e47ecb8  // bfmmla v24.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e46ec2c  // bfmmla v12.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec74  // bfmmla v20.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbc  // bfmmla v28.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e47ec29  // bfmmla v9.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec71  // bfmmla v17.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecb9  // bfmmla v25.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e46ec2d  // bfmmla v13.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec75  // bfmmla v21.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbd  // bfmmla v29.4s, v5.8h, v6.8h\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e47ec2a  // bfmmla v10.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec72  // bfmmla v18.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecba  // bfmmla v26.4s, v5.8h, v7.8h\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e46ec76  // bfmmla v22.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e46ecbe  // bfmmla v30.4s, v5.8h, v6.8h\n"
+      "ldr q0, [x10, #0xa0]\n"
+      ".inst 0x6e42ec2c  // bfmmla v12.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec74  // bfmmla v20.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbc  // bfmmla v28.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x10, #0xb0]\n"
+      ".inst 0x6e40ec29  // bfmmla v9.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xc0]\n"
+      ".inst 0x6e42ec2d  // bfmmla v13.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec75  // bfmmla v21.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbd  // bfmmla v29.4s, v5.8h, v2.8h\n"
+      "ldr q2, [x10, #0xd0]\n"
+      ".inst 0x6e40ec2a  // bfmmla v10.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecba  // bfmmla v26.4s, v5.8h, v0.8h\n"
+      "ldr q0, [x10, #0xe0]\n"
+      ".inst 0x6e42ec2e  // bfmmla v14.4s, v1.8h, v2.8h\n"
+      ".inst 0x6e42ec76  // bfmmla v22.4s, v3.8h, v2.8h\n"
+      ".inst 0x6e42ecbe  // bfmmla v30.4s, v5.8h, v2.8h\n"
       "ldr q6, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e47ec2b  // bfmmla v11.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e47ec73  // bfmmla v19.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e47ecbb  // bfmmla v27.4s, v5.8h, v7.8h\n"
+      ".inst 0x6e40ec2b  // bfmmla v11.4s, v1.8h, v0.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ecbb  // bfmmla v27.4s, v5.8h, v0.8h\n"
       ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
       ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
       ".inst 0x6e46ecbf  // bfmmla v31.4s, v5.8h, v6.8h\n"
@@ -3309,49 +3308,49 @@
       "blt 208f\n"
       "207:"  // Height 6: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x4\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "cmp x27, #0x4\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
+      "ldr d1, [x22], #0x8\n"
+      "ldr d0, [x21], #0x8\n"
+      "trn1 v2.2d, v1.2d, v0.2d\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e41ec88  // bfmmla v8.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec70  // bfmmla v16.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec58  // bfmmla v24.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x6e40ec8c  // bfmmla v12.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec74  // bfmmla v20.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5c  // bfmmla v28.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e41ec89  // bfmmla v9.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec71  // bfmmla v17.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec59  // bfmmla v25.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x40]\n"
+      ".inst 0x6e40ec8d  // bfmmla v13.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec75  // bfmmla v21.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5d  // bfmmla v29.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e41ec8a  // bfmmla v10.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e41ec72  // bfmmla v18.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5a  // bfmmla v26.4s, v2.8h, v1.8h\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e40ec8e  // bfmmla v14.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec76  // bfmmla v22.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5e  // bfmmla v30.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e46ec8b  // bfmmla v11.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e46ec73  // bfmmla v19.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5b  // bfmmla v27.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e40ec8f  // bfmmla v15.4s, v4.8h, v0.8h\n"
+      ".inst 0x6e40ec77  // bfmmla v23.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5f  // bfmmla v31.4s, v2.8h, v0.8h\n"
       "bge 207b\n"
       "208:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 211f\n"
@@ -3378,42 +3377,42 @@
       "ldr h5, [x22, #0x0]\n"
       "ldr h6, [x21, #0x0]\n"
       "210:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec51  // bfmmla v17.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec99  // bfmmla v25.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9d  // bfmmla v29.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec52  // bfmmla v18.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9a  // bfmmla v26.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e46ec0e  // bfmmla v14.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9e  // bfmmla v30.4s, v4.8h, v6.8h\n"
+      "ldr q0, [x10, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e40ece8  // bfmmla v8.4s, v7.8h, v0.8h\n"
+      "trn1 v2.2d, v5.2d, v6.2d\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x6e40ec70  // bfmmla v16.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec58  // bfmmla v24.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x6e41ecec  // bfmmla v12.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec74  // bfmmla v20.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5c  // bfmmla v28.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e40ece9  // bfmmla v9.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec71  // bfmmla v17.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec59  // bfmmla v25.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x40]\n"
+      ".inst 0x6e41eced  // bfmmla v13.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec75  // bfmmla v21.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e40ecea  // bfmmla v10.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e40ec72  // bfmmla v18.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5a  // bfmmla v26.4s, v2.8h, v0.8h\n"
+      "ldr q0, [x10, #0x60]\n"
+      ".inst 0x6e41ecee  // bfmmla v14.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e41ec76  // bfmmla v22.4s, v3.8h, v1.8h\n"
+      ".inst 0x6e41ec5e  // bfmmla v30.4s, v2.8h, v1.8h\n"
       "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e47ec53  // bfmmla v19.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9b  // bfmmla v27.4s, v4.8h, v7.8h\n"
-      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9f  // bfmmla v31.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e40ec73  // bfmmla v19.4s, v3.8h, v0.8h\n"
+      ".inst 0x6e40ec5b  // bfmmla v27.4s, v2.8h, v0.8h\n"
+      ".inst 0x6e46ecef  // bfmmla v15.4s, v7.8h, v6.8h\n"
+      ".inst 0x6e46ec77  // bfmmla v23.4s, v3.8h, v6.8h\n"
+      ".inst 0x6e46ec5f  // bfmmla v31.4s, v2.8h, v6.8h\n"
       "211:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3678,7 +3677,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "224:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
index 8ce3d1b..8b80c25 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -79,12 +79,12 @@
             switch (ci->get_cpu_model()) {
                 case CPUModel::A55r1:
                     return { 6.94 };
+                default:
+                    return { 14.53 };
                 case CPUModel::A510:
                     return { 8.94 };
                 case CPUModel::V1:
                     return { 29.26 };
-                default:
-                    return { 14.53 };
             }
         }
 
@@ -108,5 +108,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
index 1963654..b049ed4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
@@ -244,11 +244,11 @@
       "23:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 24f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
       "cbnz x15, 25f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #1\n"
@@ -265,222 +265,222 @@
       "blt 27f\n"
       "26:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr d6, [x17, #0x20]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr d17, [x17, #0x20]\n"
+      "ldr x20, [x17, #0x28]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x38]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "ldr x12, [x17, #0x48]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x78]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "ldr d6, [x17, #0x100]\n"
-      "ldr x12, [x17, #0x108]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr d7, [x17, #0x110]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x118]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr d6, [x17, #0x120]\n"
-      "ldr x12, [x17, #0x128]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr d7, [x17, #0x130]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x138]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "ldr d6, [x17, #0x140]\n"
-      "ldr x12, [x17, #0x148]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr d7, [x17, #0x150]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x158]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr d6, [x17, #0x160]\n"
-      "ldr x12, [x17, #0x168]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr d7, [x17, #0x170]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x178]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "ldr d6, [x17, #0x180]\n"
-      "ldr x12, [x17, #0x188]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr d7, [x17, #0x190]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x198]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr d6, [x17, #0x1a0]\n"
-      "ldr x12, [x17, #0x1a8]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr d7, [x17, #0x1b0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x1b8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "ldr d6, [x17, #0x1c0]\n"
-      "ldr x12, [x17, #0x1c8]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr d7, [x17, #0x1d0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x1d8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr d6, [x17, #0x1e0]\n"
-      "ldr x12, [x17, #0x1e8]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr d7, [x17, #0x1f0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x1f8]\n"
-      "mov v7.d[1], x11\n"
+      "ldr d16, [x17, #0x30]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x38]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "ldr d17, [x17, #0x40]\n"
+      "ldr x20, [x17, #0x48]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr d16, [x17, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr d17, [x17, #0x60]\n"
+      "ldr x20, [x17, #0x68]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr d16, [x17, #0x70]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x78]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "ldr d17, [x17, #0x80]\n"
+      "ldr x20, [x17, #0x88]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr d16, [x17, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr d17, [x17, #0xa0]\n"
+      "ldr x20, [x17, #0xa8]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr d16, [x17, #0xb0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "ldr d17, [x17, #0xc0]\n"
+      "ldr x20, [x17, #0xc8]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr d16, [x17, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr d17, [x17, #0xe0]\n"
+      "ldr x20, [x17, #0xe8]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr d16, [x17, #0xf0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "ldr d17, [x17, #0x100]\n"
+      "ldr x20, [x17, #0x108]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr d16, [x17, #0x110]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x118]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr d17, [x17, #0x120]\n"
+      "ldr x20, [x17, #0x128]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr d16, [x17, #0x130]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x138]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "ldr d17, [x17, #0x140]\n"
+      "ldr x20, [x17, #0x148]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr d16, [x17, #0x150]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x158]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr d17, [x17, #0x160]\n"
+      "ldr x20, [x17, #0x168]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr d16, [x17, #0x170]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x178]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "ldr d17, [x17, #0x180]\n"
+      "ldr x20, [x17, #0x188]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr d16, [x17, #0x190]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x198]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr d17, [x17, #0x1a0]\n"
+      "ldr x20, [x17, #0x1a8]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr d16, [x17, #0x1b0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x1b8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "ldr d17, [x17, #0x1c0]\n"
+      "ldr x20, [x17, #0x1c8]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr d16, [x17, #0x1d0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x1d8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr d17, [x17, #0x1e0]\n"
+      "ldr x20, [x17, #0x1e8]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr d16, [x17, #0x1f0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x1f8]\n"
+      "mov v16.d[1], x20\n"
       "add x13, x13, #0x10\n"
       "add x17, x17, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
       "ldr d6, [x17, #0x0]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "ldr x20, [x17, #0x8]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "ldr d0, [x13, #0x0]\n"
       "sub x14, x14, #0x8\n"
       "ldr d7, [x17, #0x10]\n"
       "cmp x14, #0x10\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x18]\n"
-      "mov v0.d[1], x10\n"
-      "mov v7.d[1], x11\n"
+      "ldr x21, [x13, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x17, #0x18]\n"
+      "mov v0.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "bge 26b\n"
       "27:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q17, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x17, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x17, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x17, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x17, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x17, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x17, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x17, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr q6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr q7, [x17, #0x1f0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x17, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x17, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x17, #0x70]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x17, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x17, #0x90]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x17, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x17, #0xb0]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x17, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x17, #0xd0]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x17, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x17, #0xf0]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x17, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x17, #0x110]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x17, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x17, #0x130]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x17, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x17, #0x150]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x17, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x17, #0x170]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x17, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x17, #0x190]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x17, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x17, #0x1b0]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x17, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x17, #0x1d0]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr q17, [x17, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr q16, [x17, #0x1f0]\n"
       "add x13, x13, #0x10\n"
       "sub x14, x14, #0x8\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "add x17, x17, #0x200\n"
       "28:"  // Height 1: Multiply loop: Main loop skip
       "cbz x14, 30f\n"
       "29:"  // Height 1: Multiply loop: Odd block loop
       "ldr h0, [x13], #0x2\n"
       "sub x14, x14, #0x1\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x0]\n"
+      "fmla v8.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x10]\n"
+      "fmla v9.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x20]\n"
+      "fmla v10.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
       "add x17, x17, #0x40\n"
       "cbnz x14, 29b\n"
       "30:"  // Height 1: Multiply loop: No odd multiplies
@@ -491,17 +491,17 @@
       "prfm pstl1keep, [x16, #0x0]\n"
       "tbz %x[flags], #1, 31f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v0.8h\n"
-      "fmin v9.8h, v9.8h, v0.8h\n"
-      "fmin v10.8h, v10.8h, v0.8h\n"
-      "fmin v11.8h, v11.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v16.8h\n"
+      "fmin v9.8h, v9.8h, v16.8h\n"
+      "fmin v10.8h, v10.8h, v16.8h\n"
+      "fmin v11.8h, v11.8h, v16.8h\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmax v8.8h, v8.8h, v16.8h\n"
+      "fmax v9.8h, v9.8h, v16.8h\n"
+      "fmax v10.8h, v10.8h, v16.8h\n"
+      "fmax v11.8h, v11.8h, v16.8h\n"
       "31:"  // Height 1: No activation
       "cmp x8, #0x20\n"
       "bge 48f\n"
@@ -799,324 +799,324 @@
       "72:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 73f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
       "cbnz x15, 74f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #1\n"
-      "add x9, x9, x20, LSL #1\n"
+      "add x12, x12, x20, LSL #1\n"
       "b 74f\n"
       "73:"  // Height 2: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #1\n"
+      "add x12, x13, x21, LSL #1\n"
       "74:"  // Height 2: input setup done
       "cmp x14, #0x8\n"
       "blt 77f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x10\n"
-      "ldr q1, [x9, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 76f\n"
       "75:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d17, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr x12, [x17, #0x48]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr d6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr x12, [x17, #0x108]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr d7, [x17, #0x110]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x118]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr x12, [x17, #0x128]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr d6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr x11, [x17, #0x138]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr d7, [x17, #0x130]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr d6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr x12, [x17, #0x148]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr d7, [x17, #0x150]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x158]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr x12, [x17, #0x168]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr d6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr x11, [x17, #0x178]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr d7, [x17, #0x170]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr d6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr x12, [x17, #0x188]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr d7, [x17, #0x190]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x198]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr x12, [x17, #0x1a8]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr d6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr x11, [x17, #0x1b8]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr d7, [x17, #0x1b0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr d6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr x12, [x17, #0x1c8]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr d7, [x17, #0x1d0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x1d8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr x12, [x17, #0x1e8]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr d6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr x11, [x17, #0x1f8]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "ldr d7, [x17, #0x1f0]\n"
-      "mov v6.d[1], x12\n"
+      "ldr d16, [x17, #0x30]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[0]\n"
+      "ldr d17, [x17, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr x20, [x17, #0x48]\n"
+      "fmla v15.8h, v16.8h, v1.h[0]\n"
+      "ldr d16, [x17, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v12.8h, v17.8h, v1.h[1]\n"
+      "ldr d17, [x17, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v13.8h, v16.8h, v1.h[1]\n"
+      "ldr d16, [x17, #0x70]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[1]\n"
+      "ldr d17, [x17, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr x20, [x17, #0x88]\n"
+      "fmla v15.8h, v16.8h, v1.h[1]\n"
+      "ldr d16, [x17, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v12.8h, v17.8h, v1.h[2]\n"
+      "ldr d17, [x17, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v13.8h, v16.8h, v1.h[2]\n"
+      "ldr d16, [x17, #0xb0]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[2]\n"
+      "ldr d17, [x17, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr x20, [x17, #0xc8]\n"
+      "fmla v15.8h, v16.8h, v1.h[2]\n"
+      "ldr d16, [x17, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v12.8h, v17.8h, v1.h[3]\n"
+      "ldr d17, [x17, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v13.8h, v16.8h, v1.h[3]\n"
+      "ldr d16, [x17, #0xf0]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[3]\n"
+      "ldr d17, [x17, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr x20, [x17, #0x108]\n"
+      "fmla v15.8h, v16.8h, v1.h[3]\n"
+      "ldr d16, [x17, #0x110]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x118]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr x21, [x17, #0x128]\n"
+      "fmla v12.8h, v17.8h, v1.h[4]\n"
+      "ldr d17, [x17, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr x20, [x17, #0x138]\n"
+      "fmla v13.8h, v16.8h, v1.h[4]\n"
+      "ldr d16, [x17, #0x130]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[4]\n"
+      "ldr d17, [x17, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr x20, [x17, #0x148]\n"
+      "fmla v15.8h, v16.8h, v1.h[4]\n"
+      "ldr d16, [x17, #0x150]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x158]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr x21, [x17, #0x168]\n"
+      "fmla v12.8h, v17.8h, v1.h[5]\n"
+      "ldr d17, [x17, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr x20, [x17, #0x178]\n"
+      "fmla v13.8h, v16.8h, v1.h[5]\n"
+      "ldr d16, [x17, #0x170]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[5]\n"
+      "ldr d17, [x17, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr x20, [x17, #0x188]\n"
+      "fmla v15.8h, v16.8h, v1.h[5]\n"
+      "ldr d16, [x17, #0x190]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x198]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr x21, [x17, #0x1a8]\n"
+      "fmla v12.8h, v17.8h, v1.h[6]\n"
+      "ldr d17, [x17, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr x20, [x17, #0x1b8]\n"
+      "fmla v13.8h, v16.8h, v1.h[6]\n"
+      "ldr d16, [x17, #0x1b0]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.8h, v17.8h, v1.h[6]\n"
+      "ldr d17, [x17, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr x20, [x17, #0x1c8]\n"
+      "fmla v15.8h, v16.8h, v1.h[6]\n"
+      "ldr d16, [x17, #0x1d0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x1d8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr x21, [x17, #0x1e8]\n"
+      "fmla v12.8h, v17.8h, v1.h[7]\n"
+      "ldr d17, [x17, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr x20, [x17, #0x1f8]\n"
+      "fmla v13.8h, v16.8h, v1.h[7]\n"
+      "ldr d16, [x17, #0x1f0]\n"
+      "mov v17.d[1], x21\n"
       "add x13, x13, #0x10\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
+      "mov v16.d[1], x20\n"
+      "add x12, x12, #0x10\n"
       "add x17, x17, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v14.8h, v17.8h, v1.h[7]\n"
       "ldr d6, [x17, #0x0]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "ldr x21, [x17, #0x8]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "fmla v15.8h, v16.8h, v1.h[7]\n"
+      "ldr d1, [x12, #0x0]\n"
       "sub x14, x14, #0x8\n"
       "ldr d7, [x17, #0x10]\n"
       "cmp x14, #0x10\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x28, [x9, #0x8]\n"
-      "mov v0.d[1], x10\n"
-      "ldr x11, [x17, #0x18]\n"
-      "mov v1.d[1], x28\n"
+      "ldr x20, [x13, #0x8]\n"
+      "mov v6.d[1], x21\n"
+      "ldr x21, [x12, #0x8]\n"
+      "mov v0.d[1], x20\n"
+      "ldr x20, [x17, #0x18]\n"
+      "mov v1.d[1], x21\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v7.d[1], x11\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "mov v7.d[1], x20\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "bge 75b\n"
       "76:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q17, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
       "sub x14, x14, #0x8\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v14.8h, v17.8h, v1.h[0]\n"
+      "ldr q17, [x17, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x17, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x17, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x17, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x17, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x17, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x17, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x17, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr q6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "ldr q7, [x17, #0x1f0]\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v15.8h, v16.8h, v1.h[0]\n"
+      "ldr q16, [x17, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v12.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x17, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "fmla v13.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x17, #0x70]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "fmla v14.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x17, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "fmla v15.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x17, #0x90]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "fmla v12.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x17, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "fmla v13.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x17, #0xb0]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "fmla v14.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x17, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "fmla v15.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x17, #0xd0]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "fmla v12.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x17, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "fmla v13.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x17, #0xf0]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "fmla v14.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x17, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "fmla v15.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x17, #0x110]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "fmla v12.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x17, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "fmla v13.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x17, #0x130]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "fmla v14.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x17, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "fmla v15.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x17, #0x150]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "fmla v12.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x17, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "fmla v13.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x17, #0x170]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "fmla v14.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x17, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "fmla v15.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x17, #0x190]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "fmla v12.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x17, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "fmla v13.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x17, #0x1b0]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "fmla v14.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x17, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "fmla v15.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x17, #0x1d0]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "fmla v12.8h, v17.8h, v1.h[7]\n"
+      "ldr q17, [x17, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "fmla v13.8h, v16.8h, v1.h[7]\n"
+      "ldr q16, [x17, #0x1f0]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
       "add x17, x17, #0x200\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v14.8h, v17.8h, v1.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
+      "fmla v15.8h, v16.8h, v1.h[7]\n"
       "77:"  // Height 2: Multiply loop: Main loop skip
       "cbz x14, 79f\n"
       "78:"  // Height 2: Multiply loop: Odd block loop
-      "ldr h0, [x13], #0x2\n"
+      "ldr h1, [x13], #0x2\n"
       "sub x14, x14, #0x1\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr h0, [x12], #0x2\n"
+      "ldr q17, [x17, #0x0]\n"
+      "fmla v8.8h, v17.8h, v1.h[0]\n"
+      "ldr q16, [x17, #0x10]\n"
+      "fmla v12.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x17, #0x20]\n"
+      "fmla v9.8h, v16.8h, v1.h[0]\n"
+      "fmla v13.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v10.8h, v17.8h, v1.h[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v14.8h, v17.8h, v0.h[0]\n"
+      "fmla v11.8h, v16.8h, v1.h[0]\n"
+      "fmla v15.8h, v16.8h, v0.h[0]\n"
       "cbnz x14, 78b\n"
       "79:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1129,25 +1129,25 @@
       "prfm pstl1keep, [x25, #0x0]\n"
       "tbz %x[flags], #1, 80f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v0.8h\n"
-      "fmin v9.8h, v9.8h, v0.8h\n"
-      "fmin v10.8h, v10.8h, v0.8h\n"
-      "fmin v11.8h, v11.8h, v0.8h\n"
-      "fmin v12.8h, v12.8h, v0.8h\n"
-      "fmin v13.8h, v13.8h, v0.8h\n"
-      "fmin v14.8h, v14.8h, v0.8h\n"
-      "fmin v15.8h, v15.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v16.8h\n"
+      "fmin v9.8h, v9.8h, v16.8h\n"
+      "fmin v10.8h, v10.8h, v16.8h\n"
+      "fmin v11.8h, v11.8h, v16.8h\n"
+      "fmin v12.8h, v12.8h, v16.8h\n"
+      "fmin v13.8h, v13.8h, v16.8h\n"
+      "fmin v14.8h, v14.8h, v16.8h\n"
+      "fmin v15.8h, v15.8h, v16.8h\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmax v8.8h, v8.8h, v16.8h\n"
+      "fmax v9.8h, v9.8h, v16.8h\n"
+      "fmax v10.8h, v10.8h, v16.8h\n"
+      "fmax v11.8h, v11.8h, v16.8h\n"
+      "fmax v12.8h, v12.8h, v16.8h\n"
+      "fmax v13.8h, v13.8h, v16.8h\n"
+      "fmax v14.8h, v14.8h, v16.8h\n"
+      "fmax v15.8h, v15.8h, v16.8h\n"
       "80:"  // Height 2: No activation
       "cmp x8, #0x20\n"
       "bge 97f\n"
@@ -1526,404 +1526,404 @@
       "121:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 122f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
       "cbnz x15, 123f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #1\n"
-      "add x9, x9, x20, LSL #1\n"
-      "add x27, x27, x20, LSL #1\n"
+      "add x12, x12, x20, LSL #1\n"
+      "add x11, x11, x20, LSL #1\n"
       "b 123f\n"
       "122:"  // Height 3: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #1\n"
-      "add x27, x9, x20, LSL #1\n"
+      "add x12, x13, x21, LSL #1\n"
+      "add x11, x12, x21, LSL #1\n"
       "123:"  // Height 3: input setup done
       "cmp x14, #0x8\n"
       "blt 126f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x10\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 125f\n"
       "124:"  // Height 3: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d21, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v21.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x58]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x98]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x108]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0x118]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "ldr d6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x128]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "ldr d7, [x17, #0x110]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x138]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "ldr d6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x148]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "ldr d7, [x17, #0x130]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x158]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "ldr d6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x168]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "ldr d7, [x17, #0x150]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x178]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "ldr d6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x188]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "ldr d7, [x17, #0x170]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x198]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "ldr d6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x1a8]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "ldr d7, [x17, #0x190]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1b8]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "ldr d6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1c8]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "ldr d7, [x17, #0x1b0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1d8]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "ldr d6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1e8]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "ldr d7, [x17, #0x1d0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x1f8]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "ldr d6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "ldr d20, [x17, #0x30]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
+      "ldr x20, [x17, #0x58]\n"
+      "fmla v18.8h, v21.8h, v2.h[0]\n"
+      "ldr d21, [x17, #0x40]\n"
+      "fmla v11.8h, v20.8h, v0.h[0]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v19.8h, v20.8h, v2.h[0]\n"
+      "ldr d20, [x17, #0x50]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[1]\n"
+      "fmla v12.8h, v21.8h, v1.h[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v16.8h, v21.8h, v2.h[1]\n"
+      "ldr d21, [x17, #0x60]\n"
+      "fmla v9.8h, v20.8h, v0.h[1]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[1]\n"
+      "ldr x21, [x17, #0x88]\n"
+      "fmla v17.8h, v20.8h, v2.h[1]\n"
+      "ldr d20, [x17, #0x70]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[1]\n"
+      "fmla v14.8h, v21.8h, v1.h[1]\n"
+      "ldr x20, [x17, #0x98]\n"
+      "fmla v18.8h, v21.8h, v2.h[1]\n"
+      "ldr d21, [x17, #0x80]\n"
+      "fmla v11.8h, v20.8h, v0.h[1]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[1]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v19.8h, v20.8h, v2.h[1]\n"
+      "ldr d20, [x17, #0x90]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[2]\n"
+      "fmla v12.8h, v21.8h, v1.h[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v16.8h, v21.8h, v2.h[2]\n"
+      "ldr d21, [x17, #0xa0]\n"
+      "fmla v9.8h, v20.8h, v0.h[2]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[2]\n"
+      "ldr x21, [x17, #0xc8]\n"
+      "fmla v17.8h, v20.8h, v2.h[2]\n"
+      "ldr d20, [x17, #0xb0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[2]\n"
+      "fmla v14.8h, v21.8h, v1.h[2]\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "fmla v18.8h, v21.8h, v2.h[2]\n"
+      "ldr d21, [x17, #0xc0]\n"
+      "fmla v11.8h, v20.8h, v0.h[2]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[2]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v19.8h, v20.8h, v2.h[2]\n"
+      "ldr d20, [x17, #0xd0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[3]\n"
+      "fmla v12.8h, v21.8h, v1.h[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v16.8h, v21.8h, v2.h[3]\n"
+      "ldr d21, [x17, #0xe0]\n"
+      "fmla v9.8h, v20.8h, v0.h[3]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[3]\n"
+      "ldr x21, [x17, #0x108]\n"
+      "fmla v17.8h, v20.8h, v2.h[3]\n"
+      "ldr d20, [x17, #0xf0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[3]\n"
+      "fmla v14.8h, v21.8h, v1.h[3]\n"
+      "ldr x20, [x17, #0x118]\n"
+      "fmla v18.8h, v21.8h, v2.h[3]\n"
+      "ldr d21, [x17, #0x100]\n"
+      "fmla v11.8h, v20.8h, v0.h[3]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[3]\n"
+      "ldr x21, [x17, #0x128]\n"
+      "fmla v19.8h, v20.8h, v2.h[3]\n"
+      "ldr d20, [x17, #0x110]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[4]\n"
+      "fmla v12.8h, v21.8h, v1.h[4]\n"
+      "ldr x20, [x17, #0x138]\n"
+      "fmla v16.8h, v21.8h, v2.h[4]\n"
+      "ldr d21, [x17, #0x120]\n"
+      "fmla v9.8h, v20.8h, v0.h[4]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[4]\n"
+      "ldr x21, [x17, #0x148]\n"
+      "fmla v17.8h, v20.8h, v2.h[4]\n"
+      "ldr d20, [x17, #0x130]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[4]\n"
+      "fmla v14.8h, v21.8h, v1.h[4]\n"
+      "ldr x20, [x17, #0x158]\n"
+      "fmla v18.8h, v21.8h, v2.h[4]\n"
+      "ldr d21, [x17, #0x140]\n"
+      "fmla v11.8h, v20.8h, v0.h[4]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[4]\n"
+      "ldr x21, [x17, #0x168]\n"
+      "fmla v19.8h, v20.8h, v2.h[4]\n"
+      "ldr d20, [x17, #0x150]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[5]\n"
+      "fmla v12.8h, v21.8h, v1.h[5]\n"
+      "ldr x20, [x17, #0x178]\n"
+      "fmla v16.8h, v21.8h, v2.h[5]\n"
+      "ldr d21, [x17, #0x160]\n"
+      "fmla v9.8h, v20.8h, v0.h[5]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[5]\n"
+      "ldr x21, [x17, #0x188]\n"
+      "fmla v17.8h, v20.8h, v2.h[5]\n"
+      "ldr d20, [x17, #0x170]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[5]\n"
+      "fmla v14.8h, v21.8h, v1.h[5]\n"
+      "ldr x20, [x17, #0x198]\n"
+      "fmla v18.8h, v21.8h, v2.h[5]\n"
+      "ldr d21, [x17, #0x180]\n"
+      "fmla v11.8h, v20.8h, v0.h[5]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[5]\n"
+      "ldr x21, [x17, #0x1a8]\n"
+      "fmla v19.8h, v20.8h, v2.h[5]\n"
+      "ldr d20, [x17, #0x190]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[6]\n"
+      "fmla v12.8h, v21.8h, v1.h[6]\n"
+      "ldr x20, [x17, #0x1b8]\n"
+      "fmla v16.8h, v21.8h, v2.h[6]\n"
+      "ldr d21, [x17, #0x1a0]\n"
+      "fmla v9.8h, v20.8h, v0.h[6]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[6]\n"
+      "ldr x21, [x17, #0x1c8]\n"
+      "fmla v17.8h, v20.8h, v2.h[6]\n"
+      "ldr d20, [x17, #0x1b0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.8h, v21.8h, v0.h[6]\n"
+      "fmla v14.8h, v21.8h, v1.h[6]\n"
+      "ldr x20, [x17, #0x1d8]\n"
+      "fmla v18.8h, v21.8h, v2.h[6]\n"
+      "ldr d21, [x17, #0x1c0]\n"
+      "fmla v11.8h, v20.8h, v0.h[6]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.8h, v20.8h, v1.h[6]\n"
+      "ldr x21, [x17, #0x1e8]\n"
+      "fmla v19.8h, v20.8h, v2.h[6]\n"
+      "ldr d20, [x17, #0x1d0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.8h, v21.8h, v0.h[7]\n"
+      "fmla v12.8h, v21.8h, v1.h[7]\n"
+      "ldr x20, [x17, #0x1f8]\n"
+      "fmla v16.8h, v21.8h, v2.h[7]\n"
+      "ldr d21, [x17, #0x1e0]\n"
+      "fmla v9.8h, v20.8h, v0.h[7]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.8h, v20.8h, v1.h[7]\n"
       "add x13, x13, #0x10\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "ldr d7, [x17, #0x1f0]\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
-      "add x27, x27, #0x10\n"
+      "fmla v17.8h, v20.8h, v2.h[7]\n"
+      "ldr d20, [x17, #0x1f0]\n"
+      "mov v20.d[1], x20\n"
+      "add x12, x12, #0x10\n"
+      "add x11, x11, #0x10\n"
       "add x17, x17, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "ldr x10, [x13, #0x8]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v10.8h, v21.8h, v0.h[7]\n"
+      "ldr x20, [x17, #0x8]\n"
+      "fmla v14.8h, v21.8h, v1.h[7]\n"
+      "ldr x23, [x13, #0x8]\n"
+      "fmla v18.8h, v21.8h, v2.h[7]\n"
       "ldr d6, [x17, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v20.8h, v0.h[7]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "fmla v15.8h, v20.8h, v1.h[7]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "ldr x22, [x12, #0x8]\n"
+      "fmla v19.8h, v20.8h, v2.h[7]\n"
+      "ldr d2, [x11, #0x0]\n"
       "sub x14, x14, #0x8\n"
       "ldr d7, [x17, #0x10]\n"
       "cmp x14, #0x10\n"
-      "ldr x26, [x27, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x18]\n"
-      "mov v0.d[1], x10\n"
+      "ldr x21, [x11, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x17, #0x18]\n"
+      "mov v0.d[1], x23\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v1.d[1], x28\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "mov v2.d[1], x26\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "mov v7.d[1], x11\n"
+      "mov v1.d[1], x22\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "mov v2.d[1], x21\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "mov v7.d[1], x20\n"
       "bge 124b\n"
       "125:"  // Height 3: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q21, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "sub x14, x14, #0x8\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr q20, [x17, #0x30]\n"
+      "fmla v10.8h, v21.8h, v0.h[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x17, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x17, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x17, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x17, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x17, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x17, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x17, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "ldr q6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "ldr q7, [x17, #0x1f0]\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v18.8h, v21.8h, v2.h[0]\n"
+      "ldr q21, [x17, #0x40]\n"
+      "fmla v11.8h, v20.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v2.h[0]\n"
+      "ldr q20, [x17, #0x50]\n"
+      "fmla v8.8h, v21.8h, v0.h[1]\n"
+      "fmla v12.8h, v21.8h, v1.h[1]\n"
+      "fmla v16.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x17, #0x60]\n"
+      "fmla v9.8h, v20.8h, v0.h[1]\n"
+      "fmla v13.8h, v20.8h, v1.h[1]\n"
+      "fmla v17.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x17, #0x70]\n"
+      "fmla v10.8h, v21.8h, v0.h[1]\n"
+      "fmla v14.8h, v21.8h, v1.h[1]\n"
+      "fmla v18.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x17, #0x80]\n"
+      "fmla v11.8h, v20.8h, v0.h[1]\n"
+      "fmla v15.8h, v20.8h, v1.h[1]\n"
+      "fmla v19.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x17, #0x90]\n"
+      "fmla v8.8h, v21.8h, v0.h[2]\n"
+      "fmla v12.8h, v21.8h, v1.h[2]\n"
+      "fmla v16.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x17, #0xa0]\n"
+      "fmla v9.8h, v20.8h, v0.h[2]\n"
+      "fmla v13.8h, v20.8h, v1.h[2]\n"
+      "fmla v17.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x17, #0xb0]\n"
+      "fmla v10.8h, v21.8h, v0.h[2]\n"
+      "fmla v14.8h, v21.8h, v1.h[2]\n"
+      "fmla v18.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x17, #0xc0]\n"
+      "fmla v11.8h, v20.8h, v0.h[2]\n"
+      "fmla v15.8h, v20.8h, v1.h[2]\n"
+      "fmla v19.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x17, #0xd0]\n"
+      "fmla v8.8h, v21.8h, v0.h[3]\n"
+      "fmla v12.8h, v21.8h, v1.h[3]\n"
+      "fmla v16.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x17, #0xe0]\n"
+      "fmla v9.8h, v20.8h, v0.h[3]\n"
+      "fmla v13.8h, v20.8h, v1.h[3]\n"
+      "fmla v17.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x17, #0xf0]\n"
+      "fmla v10.8h, v21.8h, v0.h[3]\n"
+      "fmla v14.8h, v21.8h, v1.h[3]\n"
+      "fmla v18.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x17, #0x100]\n"
+      "fmla v11.8h, v20.8h, v0.h[3]\n"
+      "fmla v15.8h, v20.8h, v1.h[3]\n"
+      "fmla v19.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x17, #0x110]\n"
+      "fmla v8.8h, v21.8h, v0.h[4]\n"
+      "fmla v12.8h, v21.8h, v1.h[4]\n"
+      "fmla v16.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x17, #0x120]\n"
+      "fmla v9.8h, v20.8h, v0.h[4]\n"
+      "fmla v13.8h, v20.8h, v1.h[4]\n"
+      "fmla v17.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x17, #0x130]\n"
+      "fmla v10.8h, v21.8h, v0.h[4]\n"
+      "fmla v14.8h, v21.8h, v1.h[4]\n"
+      "fmla v18.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x17, #0x140]\n"
+      "fmla v11.8h, v20.8h, v0.h[4]\n"
+      "fmla v15.8h, v20.8h, v1.h[4]\n"
+      "fmla v19.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x17, #0x150]\n"
+      "fmla v8.8h, v21.8h, v0.h[5]\n"
+      "fmla v12.8h, v21.8h, v1.h[5]\n"
+      "fmla v16.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x17, #0x160]\n"
+      "fmla v9.8h, v20.8h, v0.h[5]\n"
+      "fmla v13.8h, v20.8h, v1.h[5]\n"
+      "fmla v17.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x17, #0x170]\n"
+      "fmla v10.8h, v21.8h, v0.h[5]\n"
+      "fmla v14.8h, v21.8h, v1.h[5]\n"
+      "fmla v18.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x17, #0x180]\n"
+      "fmla v11.8h, v20.8h, v0.h[5]\n"
+      "fmla v15.8h, v20.8h, v1.h[5]\n"
+      "fmla v19.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x17, #0x190]\n"
+      "fmla v8.8h, v21.8h, v0.h[6]\n"
+      "fmla v12.8h, v21.8h, v1.h[6]\n"
+      "fmla v16.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x17, #0x1a0]\n"
+      "fmla v9.8h, v20.8h, v0.h[6]\n"
+      "fmla v13.8h, v20.8h, v1.h[6]\n"
+      "fmla v17.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x17, #0x1b0]\n"
+      "fmla v10.8h, v21.8h, v0.h[6]\n"
+      "fmla v14.8h, v21.8h, v1.h[6]\n"
+      "fmla v18.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x17, #0x1c0]\n"
+      "fmla v11.8h, v20.8h, v0.h[6]\n"
+      "fmla v15.8h, v20.8h, v1.h[6]\n"
+      "fmla v19.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x17, #0x1d0]\n"
+      "fmla v8.8h, v21.8h, v0.h[7]\n"
+      "fmla v12.8h, v21.8h, v1.h[7]\n"
+      "fmla v16.8h, v21.8h, v2.h[7]\n"
+      "ldr q21, [x17, #0x1e0]\n"
+      "fmla v9.8h, v20.8h, v0.h[7]\n"
+      "fmla v13.8h, v20.8h, v1.h[7]\n"
+      "fmla v17.8h, v20.8h, v2.h[7]\n"
+      "ldr q20, [x17, #0x1f0]\n"
+      "fmla v10.8h, v21.8h, v0.h[7]\n"
       "add x17, x17, #0x200\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v14.8h, v21.8h, v1.h[7]\n"
+      "fmla v18.8h, v21.8h, v2.h[7]\n"
+      "fmla v11.8h, v20.8h, v0.h[7]\n"
+      "fmla v15.8h, v20.8h, v1.h[7]\n"
+      "fmla v19.8h, v20.8h, v2.h[7]\n"
       "126:"  // Height 3: Multiply loop: Main loop skip
       "cbz x14, 128f\n"
       "127:"  // Height 3: Multiply loop: Odd block loop
-      "ldr h0, [x13], #0x2\n"
+      "ldr h2, [x13], #0x2\n"
       "sub x14, x14, #0x1\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h0, [x11], #0x2\n"
+      "ldr q21, [x17, #0x0]\n"
+      "fmla v8.8h, v21.8h, v2.h[0]\n"
+      "ldr q20, [x17, #0x10]\n"
+      "fmla v12.8h, v21.8h, v1.h[0]\n"
+      "fmla v16.8h, v21.8h, v0.h[0]\n"
+      "ldr q21, [x17, #0x20]\n"
+      "fmla v9.8h, v20.8h, v2.h[0]\n"
+      "fmla v13.8h, v20.8h, v1.h[0]\n"
+      "fmla v17.8h, v20.8h, v0.h[0]\n"
+      "ldr q20, [x17, #0x30]\n"
+      "fmla v10.8h, v21.8h, v2.h[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
+      "fmla v18.8h, v21.8h, v0.h[0]\n"
+      "fmla v11.8h, v20.8h, v2.h[0]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v0.h[0]\n"
       "cbnz x14, 127b\n"
       "128:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1938,33 +1938,33 @@
       "prfm pstl1keep, [x24, #0x0]\n"
       "tbz %x[flags], #1, 129f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v0.8h\n"
-      "fmin v9.8h, v9.8h, v0.8h\n"
-      "fmin v10.8h, v10.8h, v0.8h\n"
-      "fmin v11.8h, v11.8h, v0.8h\n"
-      "fmin v12.8h, v12.8h, v0.8h\n"
-      "fmin v13.8h, v13.8h, v0.8h\n"
-      "fmin v14.8h, v14.8h, v0.8h\n"
-      "fmin v15.8h, v15.8h, v0.8h\n"
-      "fmin v16.8h, v16.8h, v0.8h\n"
-      "fmin v17.8h, v17.8h, v0.8h\n"
-      "fmin v18.8h, v18.8h, v0.8h\n"
-      "fmin v19.8h, v19.8h, v0.8h\n"
+      "ld1r { v20.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v20.8h\n"
+      "fmin v9.8h, v9.8h, v20.8h\n"
+      "fmin v10.8h, v10.8h, v20.8h\n"
+      "fmin v11.8h, v11.8h, v20.8h\n"
+      "fmin v12.8h, v12.8h, v20.8h\n"
+      "fmin v13.8h, v13.8h, v20.8h\n"
+      "fmin v14.8h, v14.8h, v20.8h\n"
+      "fmin v15.8h, v15.8h, v20.8h\n"
+      "fmin v16.8h, v16.8h, v20.8h\n"
+      "fmin v17.8h, v17.8h, v20.8h\n"
+      "fmin v18.8h, v18.8h, v20.8h\n"
+      "fmin v19.8h, v19.8h, v20.8h\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
+      "ld1r { v20.8h }, [x20]\n"
+      "fmax v8.8h, v8.8h, v20.8h\n"
+      "fmax v9.8h, v9.8h, v20.8h\n"
+      "fmax v10.8h, v10.8h, v20.8h\n"
+      "fmax v11.8h, v11.8h, v20.8h\n"
+      "fmax v12.8h, v12.8h, v20.8h\n"
+      "fmax v13.8h, v13.8h, v20.8h\n"
+      "fmax v14.8h, v14.8h, v20.8h\n"
+      "fmax v15.8h, v15.8h, v20.8h\n"
+      "fmax v16.8h, v16.8h, v20.8h\n"
+      "fmax v17.8h, v17.8h, v20.8h\n"
+      "fmax v18.8h, v18.8h, v20.8h\n"
+      "fmax v19.8h, v19.8h, v20.8h\n"
       "129:"  // Height 3: No activation
       "cmp x8, #0x20\n"
       "bge 146f\n"
@@ -2424,484 +2424,484 @@
       "170:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 171f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
       "cbnz x15, 172f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #1\n"
-      "add x9, x9, x20, LSL #1\n"
-      "add x27, x27, x20, LSL #1\n"
-      "add x25, x25, x20, LSL #1\n"
+      "add x12, x12, x20, LSL #1\n"
+      "add x11, x11, x20, LSL #1\n"
+      "add x10, x10, x20, LSL #1\n"
       "b 172f\n"
       "171:"  // Height 4: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #1\n"
-      "add x27, x9, x20, LSL #1\n"
-      "add x25, x27, x20, LSL #1\n"
+      "add x12, x13, x21, LSL #1\n"
+      "add x11, x12, x21, LSL #1\n"
+      "add x10, x11, x21, LSL #1\n"
       "172:"  // Height 4: input setup done
       "cmp x14, #0x8\n"
       "blt 175f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x10\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 174f\n"
       "173:"  // Height 4: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d25, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v25.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x58]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "add x27, x27, #0x10\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "add x25, x25, #0x10\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr x10, [x13, #0x8]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x98]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "ldr d24, [x17, #0x30]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[0]\n"
+      "fmla v14.8h, v25.8h, v1.h[0]\n"
+      "ldr x20, [x17, #0x58]\n"
+      "fmla v18.8h, v25.8h, v2.h[0]\n"
+      "add x11, x11, #0x10\n"
+      "fmla v22.8h, v25.8h, v3.h[0]\n"
+      "ldr d25, [x17, #0x40]\n"
+      "fmla v11.8h, v24.8h, v0.h[0]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[0]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v19.8h, v24.8h, v2.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v23.8h, v24.8h, v3.h[0]\n"
+      "ldr d24, [x17, #0x50]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[1]\n"
+      "fmla v12.8h, v25.8h, v1.h[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v16.8h, v25.8h, v2.h[1]\n"
+      "ldr x25, [x13, #0x8]\n"
+      "fmla v20.8h, v25.8h, v3.h[1]\n"
+      "ldr d25, [x17, #0x60]\n"
+      "fmla v9.8h, v24.8h, v0.h[1]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[1]\n"
+      "ldr x21, [x17, #0x88]\n"
+      "fmla v17.8h, v24.8h, v2.h[1]\n"
+      "ldr x24, [x12, #0x8]\n"
+      "fmla v21.8h, v24.8h, v3.h[1]\n"
+      "ldr d24, [x17, #0x70]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[1]\n"
+      "fmla v14.8h, v25.8h, v1.h[1]\n"
+      "ldr x20, [x17, #0x98]\n"
+      "fmla v18.8h, v25.8h, v2.h[1]\n"
+      "ldr x23, [x11, #0x8]\n"
+      "fmla v22.8h, v25.8h, v3.h[1]\n"
+      "ldr d25, [x17, #0x80]\n"
+      "fmla v11.8h, v24.8h, v0.h[1]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[1]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v19.8h, v24.8h, v2.h[1]\n"
+      "ldr x22, [x10, #0x8]\n"
+      "fmla v23.8h, v24.8h, v3.h[1]\n"
+      "ldr d24, [x17, #0x90]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[2]\n"
+      "fmla v12.8h, v25.8h, v1.h[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v16.8h, v25.8h, v2.h[2]\n"
       "sub x14, x14, #0x8\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v20.8h, v25.8h, v3.h[2]\n"
+      "ldr d25, [x17, #0xa0]\n"
+      "fmla v9.8h, v24.8h, v0.h[2]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[2]\n"
+      "ldr x21, [x17, #0xc8]\n"
+      "fmla v17.8h, v24.8h, v2.h[2]\n"
       "cmp x14, #0x10\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v21.8h, v24.8h, v3.h[2]\n"
+      "ldr d24, [x17, #0xb0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[2]\n"
+      "fmla v14.8h, v25.8h, v1.h[2]\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "fmla v18.8h, v25.8h, v2.h[2]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x108]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0x118]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "ldr d6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x128]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "ldr d7, [x17, #0x110]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x138]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "ldr d6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x148]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "ldr d7, [x17, #0x130]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x158]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "ldr d6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x168]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "ldr d7, [x17, #0x150]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x178]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "ldr d6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x188]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "ldr d7, [x17, #0x170]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x198]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "ldr d6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x1a8]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "ldr d7, [x17, #0x190]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1b8]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "ldr d6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1c8]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "ldr d7, [x17, #0x1b0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1d8]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "ldr d6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1e8]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "ldr d7, [x17, #0x1d0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x1f8]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "ldr d6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "ldr d7, [x17, #0x1f0]\n"
-      "mov v7.d[1], x11\n"
+      "fmla v22.8h, v25.8h, v3.h[2]\n"
+      "ldr d25, [x17, #0xc0]\n"
+      "fmla v11.8h, v24.8h, v0.h[2]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[2]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v19.8h, v24.8h, v2.h[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v23.8h, v24.8h, v3.h[2]\n"
+      "ldr d24, [x17, #0xd0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[3]\n"
+      "fmla v12.8h, v25.8h, v1.h[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v16.8h, v25.8h, v2.h[3]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v20.8h, v25.8h, v3.h[3]\n"
+      "ldr d25, [x17, #0xe0]\n"
+      "fmla v9.8h, v24.8h, v0.h[3]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[3]\n"
+      "ldr x21, [x17, #0x108]\n"
+      "fmla v17.8h, v24.8h, v2.h[3]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v21.8h, v24.8h, v3.h[3]\n"
+      "ldr d24, [x17, #0xf0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[3]\n"
+      "fmla v14.8h, v25.8h, v1.h[3]\n"
+      "ldr x20, [x17, #0x118]\n"
+      "fmla v18.8h, v25.8h, v2.h[3]\n"
+      "fmla v22.8h, v25.8h, v3.h[3]\n"
+      "ldr d25, [x17, #0x100]\n"
+      "fmla v11.8h, v24.8h, v0.h[3]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[3]\n"
+      "ldr x21, [x17, #0x128]\n"
+      "fmla v19.8h, v24.8h, v2.h[3]\n"
+      "fmla v23.8h, v24.8h, v3.h[3]\n"
+      "ldr d24, [x17, #0x110]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[4]\n"
+      "fmla v12.8h, v25.8h, v1.h[4]\n"
+      "ldr x20, [x17, #0x138]\n"
+      "fmla v16.8h, v25.8h, v2.h[4]\n"
+      "fmla v20.8h, v25.8h, v3.h[4]\n"
+      "ldr d25, [x17, #0x120]\n"
+      "fmla v9.8h, v24.8h, v0.h[4]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[4]\n"
+      "ldr x21, [x17, #0x148]\n"
+      "fmla v17.8h, v24.8h, v2.h[4]\n"
+      "fmla v21.8h, v24.8h, v3.h[4]\n"
+      "ldr d24, [x17, #0x130]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[4]\n"
+      "fmla v14.8h, v25.8h, v1.h[4]\n"
+      "ldr x20, [x17, #0x158]\n"
+      "fmla v18.8h, v25.8h, v2.h[4]\n"
+      "fmla v22.8h, v25.8h, v3.h[4]\n"
+      "ldr d25, [x17, #0x140]\n"
+      "fmla v11.8h, v24.8h, v0.h[4]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[4]\n"
+      "ldr x21, [x17, #0x168]\n"
+      "fmla v19.8h, v24.8h, v2.h[4]\n"
+      "fmla v23.8h, v24.8h, v3.h[4]\n"
+      "ldr d24, [x17, #0x150]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[5]\n"
+      "fmla v12.8h, v25.8h, v1.h[5]\n"
+      "ldr x20, [x17, #0x178]\n"
+      "fmla v16.8h, v25.8h, v2.h[5]\n"
+      "fmla v20.8h, v25.8h, v3.h[5]\n"
+      "ldr d25, [x17, #0x160]\n"
+      "fmla v9.8h, v24.8h, v0.h[5]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[5]\n"
+      "ldr x21, [x17, #0x188]\n"
+      "fmla v17.8h, v24.8h, v2.h[5]\n"
+      "fmla v21.8h, v24.8h, v3.h[5]\n"
+      "ldr d24, [x17, #0x170]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[5]\n"
+      "fmla v14.8h, v25.8h, v1.h[5]\n"
+      "ldr x20, [x17, #0x198]\n"
+      "fmla v18.8h, v25.8h, v2.h[5]\n"
+      "fmla v22.8h, v25.8h, v3.h[5]\n"
+      "ldr d25, [x17, #0x180]\n"
+      "fmla v11.8h, v24.8h, v0.h[5]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[5]\n"
+      "ldr x21, [x17, #0x1a8]\n"
+      "fmla v19.8h, v24.8h, v2.h[5]\n"
+      "fmla v23.8h, v24.8h, v3.h[5]\n"
+      "ldr d24, [x17, #0x190]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[6]\n"
+      "fmla v12.8h, v25.8h, v1.h[6]\n"
+      "ldr x20, [x17, #0x1b8]\n"
+      "fmla v16.8h, v25.8h, v2.h[6]\n"
+      "fmla v20.8h, v25.8h, v3.h[6]\n"
+      "ldr d25, [x17, #0x1a0]\n"
+      "fmla v9.8h, v24.8h, v0.h[6]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[6]\n"
+      "ldr x21, [x17, #0x1c8]\n"
+      "fmla v17.8h, v24.8h, v2.h[6]\n"
+      "fmla v21.8h, v24.8h, v3.h[6]\n"
+      "ldr d24, [x17, #0x1b0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.8h, v25.8h, v0.h[6]\n"
+      "fmla v14.8h, v25.8h, v1.h[6]\n"
+      "ldr x20, [x17, #0x1d8]\n"
+      "fmla v18.8h, v25.8h, v2.h[6]\n"
+      "fmla v22.8h, v25.8h, v3.h[6]\n"
+      "ldr d25, [x17, #0x1c0]\n"
+      "fmla v11.8h, v24.8h, v0.h[6]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.8h, v24.8h, v1.h[6]\n"
+      "ldr x21, [x17, #0x1e8]\n"
+      "fmla v19.8h, v24.8h, v2.h[6]\n"
+      "fmla v23.8h, v24.8h, v3.h[6]\n"
+      "ldr d24, [x17, #0x1d0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.8h, v25.8h, v0.h[7]\n"
+      "fmla v12.8h, v25.8h, v1.h[7]\n"
+      "ldr x20, [x17, #0x1f8]\n"
+      "fmla v16.8h, v25.8h, v2.h[7]\n"
+      "fmla v20.8h, v25.8h, v3.h[7]\n"
+      "ldr d25, [x17, #0x1e0]\n"
+      "fmla v9.8h, v24.8h, v0.h[7]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.8h, v24.8h, v1.h[7]\n"
+      "fmla v17.8h, v24.8h, v2.h[7]\n"
+      "fmla v21.8h, v24.8h, v3.h[7]\n"
+      "ldr d24, [x17, #0x1f0]\n"
+      "mov v24.d[1], x20\n"
       "add x17, x17, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x18]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v10.8h, v25.8h, v0.h[7]\n"
+      "ldr x21, [x17, #0x8]\n"
+      "fmla v14.8h, v25.8h, v1.h[7]\n"
+      "ldr x20, [x17, #0x18]\n"
+      "fmla v18.8h, v25.8h, v2.h[7]\n"
+      "fmla v22.8h, v25.8h, v3.h[7]\n"
       "ldr d6, [x17, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v24.8h, v0.h[7]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "fmla v15.8h, v24.8h, v1.h[7]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "fmla v19.8h, v24.8h, v2.h[7]\n"
+      "ldr d2, [x11, #0x0]\n"
+      "fmla v23.8h, v24.8h, v3.h[7]\n"
+      "ldr d3, [x10, #0x0]\n"
       "ldr d7, [x17, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
-      "mov v7.d[1], x11\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x25\n"
+      "mov v1.d[1], x24\n"
+      "mov v2.d[1], x23\n"
+      "mov v3.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 173b\n"
       "174:"  // Height 4: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q25, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "sub x14, x14, #0x8\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x17, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x17, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x17, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x17, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x17, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x17, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x17, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "ldr q6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "ldr q7, [x17, #0x1f0]\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "ldr q24, [x17, #0x30]\n"
+      "fmla v10.8h, v25.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v14.8h, v25.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v18.8h, v25.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v22.8h, v25.8h, v3.h[0]\n"
+      "ldr q25, [x17, #0x40]\n"
+      "fmla v11.8h, v24.8h, v0.h[0]\n"
+      "fmla v15.8h, v24.8h, v1.h[0]\n"
+      "fmla v19.8h, v24.8h, v2.h[0]\n"
+      "fmla v23.8h, v24.8h, v3.h[0]\n"
+      "ldr q24, [x17, #0x50]\n"
+      "fmla v8.8h, v25.8h, v0.h[1]\n"
+      "fmla v12.8h, v25.8h, v1.h[1]\n"
+      "fmla v16.8h, v25.8h, v2.h[1]\n"
+      "fmla v20.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x17, #0x60]\n"
+      "fmla v9.8h, v24.8h, v0.h[1]\n"
+      "fmla v13.8h, v24.8h, v1.h[1]\n"
+      "fmla v17.8h, v24.8h, v2.h[1]\n"
+      "fmla v21.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x17, #0x70]\n"
+      "fmla v10.8h, v25.8h, v0.h[1]\n"
+      "fmla v14.8h, v25.8h, v1.h[1]\n"
+      "fmla v18.8h, v25.8h, v2.h[1]\n"
+      "fmla v22.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x17, #0x80]\n"
+      "fmla v11.8h, v24.8h, v0.h[1]\n"
+      "fmla v15.8h, v24.8h, v1.h[1]\n"
+      "fmla v19.8h, v24.8h, v2.h[1]\n"
+      "fmla v23.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x17, #0x90]\n"
+      "fmla v8.8h, v25.8h, v0.h[2]\n"
+      "fmla v12.8h, v25.8h, v1.h[2]\n"
+      "fmla v16.8h, v25.8h, v2.h[2]\n"
+      "fmla v20.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x17, #0xa0]\n"
+      "fmla v9.8h, v24.8h, v0.h[2]\n"
+      "fmla v13.8h, v24.8h, v1.h[2]\n"
+      "fmla v17.8h, v24.8h, v2.h[2]\n"
+      "fmla v21.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x17, #0xb0]\n"
+      "fmla v10.8h, v25.8h, v0.h[2]\n"
+      "fmla v14.8h, v25.8h, v1.h[2]\n"
+      "fmla v18.8h, v25.8h, v2.h[2]\n"
+      "fmla v22.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x17, #0xc0]\n"
+      "fmla v11.8h, v24.8h, v0.h[2]\n"
+      "fmla v15.8h, v24.8h, v1.h[2]\n"
+      "fmla v19.8h, v24.8h, v2.h[2]\n"
+      "fmla v23.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x17, #0xd0]\n"
+      "fmla v8.8h, v25.8h, v0.h[3]\n"
+      "fmla v12.8h, v25.8h, v1.h[3]\n"
+      "fmla v16.8h, v25.8h, v2.h[3]\n"
+      "fmla v20.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x17, #0xe0]\n"
+      "fmla v9.8h, v24.8h, v0.h[3]\n"
+      "fmla v13.8h, v24.8h, v1.h[3]\n"
+      "fmla v17.8h, v24.8h, v2.h[3]\n"
+      "fmla v21.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x17, #0xf0]\n"
+      "fmla v10.8h, v25.8h, v0.h[3]\n"
+      "fmla v14.8h, v25.8h, v1.h[3]\n"
+      "fmla v18.8h, v25.8h, v2.h[3]\n"
+      "fmla v22.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x17, #0x100]\n"
+      "fmla v11.8h, v24.8h, v0.h[3]\n"
+      "fmla v15.8h, v24.8h, v1.h[3]\n"
+      "fmla v19.8h, v24.8h, v2.h[3]\n"
+      "fmla v23.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x17, #0x110]\n"
+      "fmla v8.8h, v25.8h, v0.h[4]\n"
+      "fmla v12.8h, v25.8h, v1.h[4]\n"
+      "fmla v16.8h, v25.8h, v2.h[4]\n"
+      "fmla v20.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x17, #0x120]\n"
+      "fmla v9.8h, v24.8h, v0.h[4]\n"
+      "fmla v13.8h, v24.8h, v1.h[4]\n"
+      "fmla v17.8h, v24.8h, v2.h[4]\n"
+      "fmla v21.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x17, #0x130]\n"
+      "fmla v10.8h, v25.8h, v0.h[4]\n"
+      "fmla v14.8h, v25.8h, v1.h[4]\n"
+      "fmla v18.8h, v25.8h, v2.h[4]\n"
+      "fmla v22.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x17, #0x140]\n"
+      "fmla v11.8h, v24.8h, v0.h[4]\n"
+      "fmla v15.8h, v24.8h, v1.h[4]\n"
+      "fmla v19.8h, v24.8h, v2.h[4]\n"
+      "fmla v23.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x17, #0x150]\n"
+      "fmla v8.8h, v25.8h, v0.h[5]\n"
+      "fmla v12.8h, v25.8h, v1.h[5]\n"
+      "fmla v16.8h, v25.8h, v2.h[5]\n"
+      "fmla v20.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x17, #0x160]\n"
+      "fmla v9.8h, v24.8h, v0.h[5]\n"
+      "fmla v13.8h, v24.8h, v1.h[5]\n"
+      "fmla v17.8h, v24.8h, v2.h[5]\n"
+      "fmla v21.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x17, #0x170]\n"
+      "fmla v10.8h, v25.8h, v0.h[5]\n"
+      "fmla v14.8h, v25.8h, v1.h[5]\n"
+      "fmla v18.8h, v25.8h, v2.h[5]\n"
+      "fmla v22.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x17, #0x180]\n"
+      "fmla v11.8h, v24.8h, v0.h[5]\n"
+      "fmla v15.8h, v24.8h, v1.h[5]\n"
+      "fmla v19.8h, v24.8h, v2.h[5]\n"
+      "fmla v23.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x17, #0x190]\n"
+      "fmla v8.8h, v25.8h, v0.h[6]\n"
+      "fmla v12.8h, v25.8h, v1.h[6]\n"
+      "fmla v16.8h, v25.8h, v2.h[6]\n"
+      "fmla v20.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x17, #0x1a0]\n"
+      "fmla v9.8h, v24.8h, v0.h[6]\n"
+      "fmla v13.8h, v24.8h, v1.h[6]\n"
+      "fmla v17.8h, v24.8h, v2.h[6]\n"
+      "fmla v21.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x17, #0x1b0]\n"
+      "fmla v10.8h, v25.8h, v0.h[6]\n"
+      "fmla v14.8h, v25.8h, v1.h[6]\n"
+      "fmla v18.8h, v25.8h, v2.h[6]\n"
+      "fmla v22.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x17, #0x1c0]\n"
+      "fmla v11.8h, v24.8h, v0.h[6]\n"
+      "fmla v15.8h, v24.8h, v1.h[6]\n"
+      "fmla v19.8h, v24.8h, v2.h[6]\n"
+      "fmla v23.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x17, #0x1d0]\n"
+      "fmla v8.8h, v25.8h, v0.h[7]\n"
+      "fmla v12.8h, v25.8h, v1.h[7]\n"
+      "fmla v16.8h, v25.8h, v2.h[7]\n"
+      "fmla v20.8h, v25.8h, v3.h[7]\n"
+      "ldr q25, [x17, #0x1e0]\n"
+      "fmla v9.8h, v24.8h, v0.h[7]\n"
+      "fmla v13.8h, v24.8h, v1.h[7]\n"
+      "fmla v17.8h, v24.8h, v2.h[7]\n"
+      "fmla v21.8h, v24.8h, v3.h[7]\n"
+      "ldr q24, [x17, #0x1f0]\n"
+      "fmla v10.8h, v25.8h, v0.h[7]\n"
       "add x17, x17, #0x200\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v14.8h, v25.8h, v1.h[7]\n"
+      "fmla v18.8h, v25.8h, v2.h[7]\n"
+      "fmla v22.8h, v25.8h, v3.h[7]\n"
+      "fmla v11.8h, v24.8h, v0.h[7]\n"
+      "fmla v15.8h, v24.8h, v1.h[7]\n"
+      "fmla v19.8h, v24.8h, v2.h[7]\n"
+      "fmla v23.8h, v24.8h, v3.h[7]\n"
       "175:"  // Height 4: Multiply loop: Main loop skip
       "cbz x14, 177f\n"
       "176:"  // Height 4: Multiply loop: Odd block loop
-      "ldr h0, [x13], #0x2\n"
+      "ldr h3, [x13], #0x2\n"
       "sub x14, x14, #0x1\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr h2, [x12], #0x2\n"
+      "ldr h1, [x11], #0x2\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr q25, [x17, #0x0]\n"
+      "fmla v8.8h, v25.8h, v3.h[0]\n"
+      "ldr q24, [x17, #0x10]\n"
+      "fmla v12.8h, v25.8h, v2.h[0]\n"
+      "fmla v16.8h, v25.8h, v1.h[0]\n"
+      "fmla v20.8h, v25.8h, v0.h[0]\n"
+      "ldr q25, [x17, #0x20]\n"
+      "fmla v9.8h, v24.8h, v3.h[0]\n"
+      "fmla v13.8h, v24.8h, v2.h[0]\n"
+      "fmla v17.8h, v24.8h, v1.h[0]\n"
+      "fmla v21.8h, v24.8h, v0.h[0]\n"
+      "ldr q24, [x17, #0x30]\n"
+      "fmla v10.8h, v25.8h, v3.h[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v14.8h, v25.8h, v2.h[0]\n"
+      "fmla v18.8h, v25.8h, v1.h[0]\n"
+      "fmla v22.8h, v25.8h, v0.h[0]\n"
+      "fmla v11.8h, v24.8h, v3.h[0]\n"
+      "fmla v15.8h, v24.8h, v2.h[0]\n"
+      "fmla v19.8h, v24.8h, v1.h[0]\n"
+      "fmla v23.8h, v24.8h, v0.h[0]\n"
       "cbnz x14, 176b\n"
       "177:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2918,41 +2918,41 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 178f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v0.8h\n"
-      "fmin v9.8h, v9.8h, v0.8h\n"
-      "fmin v10.8h, v10.8h, v0.8h\n"
-      "fmin v11.8h, v11.8h, v0.8h\n"
-      "fmin v12.8h, v12.8h, v0.8h\n"
-      "fmin v13.8h, v13.8h, v0.8h\n"
-      "fmin v14.8h, v14.8h, v0.8h\n"
-      "fmin v15.8h, v15.8h, v0.8h\n"
-      "fmin v16.8h, v16.8h, v0.8h\n"
-      "fmin v17.8h, v17.8h, v0.8h\n"
-      "fmin v18.8h, v18.8h, v0.8h\n"
-      "fmin v19.8h, v19.8h, v0.8h\n"
-      "fmin v20.8h, v20.8h, v0.8h\n"
-      "fmin v21.8h, v21.8h, v0.8h\n"
-      "fmin v22.8h, v22.8h, v0.8h\n"
-      "fmin v23.8h, v23.8h, v0.8h\n"
+      "ld1r { v24.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v24.8h\n"
+      "fmin v9.8h, v9.8h, v24.8h\n"
+      "fmin v10.8h, v10.8h, v24.8h\n"
+      "fmin v11.8h, v11.8h, v24.8h\n"
+      "fmin v12.8h, v12.8h, v24.8h\n"
+      "fmin v13.8h, v13.8h, v24.8h\n"
+      "fmin v14.8h, v14.8h, v24.8h\n"
+      "fmin v15.8h, v15.8h, v24.8h\n"
+      "fmin v16.8h, v16.8h, v24.8h\n"
+      "fmin v17.8h, v17.8h, v24.8h\n"
+      "fmin v18.8h, v18.8h, v24.8h\n"
+      "fmin v19.8h, v19.8h, v24.8h\n"
+      "fmin v20.8h, v20.8h, v24.8h\n"
+      "fmin v21.8h, v21.8h, v24.8h\n"
+      "fmin v22.8h, v22.8h, v24.8h\n"
+      "fmin v23.8h, v23.8h, v24.8h\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
-      "fmax v20.8h, v20.8h, v0.8h\n"
-      "fmax v21.8h, v21.8h, v0.8h\n"
-      "fmax v22.8h, v22.8h, v0.8h\n"
-      "fmax v23.8h, v23.8h, v0.8h\n"
+      "ld1r { v24.8h }, [x20]\n"
+      "fmax v8.8h, v8.8h, v24.8h\n"
+      "fmax v9.8h, v9.8h, v24.8h\n"
+      "fmax v10.8h, v10.8h, v24.8h\n"
+      "fmax v11.8h, v11.8h, v24.8h\n"
+      "fmax v12.8h, v12.8h, v24.8h\n"
+      "fmax v13.8h, v13.8h, v24.8h\n"
+      "fmax v14.8h, v14.8h, v24.8h\n"
+      "fmax v15.8h, v15.8h, v24.8h\n"
+      "fmax v16.8h, v16.8h, v24.8h\n"
+      "fmax v17.8h, v17.8h, v24.8h\n"
+      "fmax v18.8h, v18.8h, v24.8h\n"
+      "fmax v19.8h, v19.8h, v24.8h\n"
+      "fmax v20.8h, v20.8h, v24.8h\n"
+      "fmax v21.8h, v21.8h, v24.8h\n"
+      "fmax v22.8h, v22.8h, v24.8h\n"
+      "fmax v23.8h, v23.8h, v24.8h\n"
       "178:"  // Height 4: No activation
       "cmp x8, #0x20\n"
       "bge 195f\n"
@@ -3493,564 +3493,564 @@
       "219:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 220f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
       "cbnz x15, 221f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #1\n"
+      "add x12, x12, x20, LSL #1\n"
+      "add x11, x11, x20, LSL #1\n"
+      "add x10, x10, x20, LSL #1\n"
       "add x9, x9, x20, LSL #1\n"
-      "add x27, x27, x20, LSL #1\n"
-      "add x25, x25, x20, LSL #1\n"
-      "add x23, x23, x20, LSL #1\n"
       "b 221f\n"
       "220:"  // Height 5: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #1\n"
-      "add x27, x9, x20, LSL #1\n"
-      "add x25, x27, x20, LSL #1\n"
-      "add x23, x25, x20, LSL #1\n"
+      "add x12, x13, x21, LSL #1\n"
+      "add x11, x12, x21, LSL #1\n"
+      "add x10, x11, x21, LSL #1\n"
+      "add x9, x10, x21, LSL #1\n"
       "221:"  // Height 5: input setup done
       "cmp x14, #0x8\n"
       "blt 224f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x10\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 223f\n"
       "222:"  // Height 5: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d29, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v29.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x58]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "add x23, x23, #0x10\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr x10, [x13, #0x8]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr x22, [x23, #0x8]\n"
-      "fmla v24.8h, v6.8h, v4.h[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "ldr d28, [x17, #0x30]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[0]\n"
+      "fmla v14.8h, v29.8h, v1.h[0]\n"
+      "ldr x20, [x17, #0x58]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "add x9, x9, #0x10\n"
+      "fmla v22.8h, v29.8h, v3.h[0]\n"
+      "ldr x26, [x13, #0x8]\n"
+      "fmla v26.8h, v29.8h, v4.h[0]\n"
+      "ldr d29, [x17, #0x40]\n"
+      "fmla v11.8h, v28.8h, v0.h[0]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[0]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "ldr x25, [x12, #0x8]\n"
+      "fmla v23.8h, v28.8h, v3.h[0]\n"
+      "ldr x24, [x11, #0x8]\n"
+      "fmla v27.8h, v28.8h, v4.h[0]\n"
+      "ldr d28, [x17, #0x50]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[1]\n"
+      "fmla v12.8h, v29.8h, v1.h[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v16.8h, v29.8h, v2.h[1]\n"
+      "ldr x23, [x10, #0x8]\n"
+      "fmla v20.8h, v29.8h, v3.h[1]\n"
+      "ldr x22, [x9, #0x8]\n"
+      "fmla v24.8h, v29.8h, v4.h[1]\n"
+      "ldr d29, [x17, #0x60]\n"
+      "fmla v9.8h, v28.8h, v0.h[1]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[1]\n"
+      "ldr x21, [x17, #0x88]\n"
+      "fmla v17.8h, v28.8h, v2.h[1]\n"
       "sub x14, x14, #0x8\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "fmla v21.8h, v28.8h, v3.h[1]\n"
       "cmp x14, #0x10\n"
-      "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x98]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v25.8h, v28.8h, v4.h[1]\n"
+      "ldr d28, [x17, #0x70]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[1]\n"
+      "fmla v14.8h, v29.8h, v1.h[1]\n"
+      "ldr x20, [x17, #0x98]\n"
+      "fmla v18.8h, v29.8h, v2.h[1]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "fmla v22.8h, v29.8h, v3.h[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v26.8h, v29.8h, v4.h[1]\n"
+      "ldr d29, [x17, #0x80]\n"
+      "fmla v11.8h, v28.8h, v0.h[1]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[1]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v19.8h, v28.8h, v2.h[1]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v23.8h, v28.8h, v3.h[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v27.8h, v28.8h, v4.h[1]\n"
+      "ldr d28, [x17, #0x90]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[2]\n"
+      "fmla v12.8h, v29.8h, v1.h[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v16.8h, v29.8h, v2.h[2]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v26.8h, v6.8h, v4.h[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v27.8h, v7.8h, v4.h[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "fmla v24.8h, v6.8h, v4.h[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "fmla v25.8h, v7.8h, v4.h[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "fmla v26.8h, v6.8h, v4.h[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "fmla v27.8h, v7.8h, v4.h[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "fmla v24.8h, v6.8h, v4.h[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x108]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "fmla v25.8h, v7.8h, v4.h[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0x118]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "fmla v26.8h, v6.8h, v4.h[3]\n"
-      "ldr d6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x128]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "fmla v27.8h, v7.8h, v4.h[3]\n"
-      "ldr d7, [x17, #0x110]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x138]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "fmla v24.8h, v6.8h, v4.h[4]\n"
-      "ldr d6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x148]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "fmla v25.8h, v7.8h, v4.h[4]\n"
-      "ldr d7, [x17, #0x130]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x158]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "fmla v26.8h, v6.8h, v4.h[4]\n"
-      "ldr d6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x168]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "fmla v27.8h, v7.8h, v4.h[4]\n"
-      "ldr d7, [x17, #0x150]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x178]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "fmla v24.8h, v6.8h, v4.h[5]\n"
-      "ldr d6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x188]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "fmla v25.8h, v7.8h, v4.h[5]\n"
-      "ldr d7, [x17, #0x170]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x198]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "fmla v26.8h, v6.8h, v4.h[5]\n"
-      "ldr d6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x1a8]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "fmla v27.8h, v7.8h, v4.h[5]\n"
-      "ldr d7, [x17, #0x190]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1b8]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "fmla v24.8h, v6.8h, v4.h[6]\n"
-      "ldr d6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1c8]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "fmla v25.8h, v7.8h, v4.h[6]\n"
-      "ldr d7, [x17, #0x1b0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1d8]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "fmla v26.8h, v6.8h, v4.h[6]\n"
-      "ldr d6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1e8]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "fmla v27.8h, v7.8h, v4.h[6]\n"
-      "ldr d7, [x17, #0x1d0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x1f8]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "fmla v24.8h, v6.8h, v4.h[7]\n"
-      "ldr d6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "fmla v25.8h, v7.8h, v4.h[7]\n"
-      "ldr d7, [x17, #0x1f0]\n"
-      "mov v7.d[1], x11\n"
+      "fmla v20.8h, v29.8h, v3.h[2]\n"
+      "fmla v24.8h, v29.8h, v4.h[2]\n"
+      "ldr d29, [x17, #0xa0]\n"
+      "fmla v9.8h, v28.8h, v0.h[2]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[2]\n"
+      "ldr x21, [x17, #0xc8]\n"
+      "fmla v17.8h, v28.8h, v2.h[2]\n"
+      "fmla v21.8h, v28.8h, v3.h[2]\n"
+      "fmla v25.8h, v28.8h, v4.h[2]\n"
+      "ldr d28, [x17, #0xb0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[2]\n"
+      "fmla v14.8h, v29.8h, v1.h[2]\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "fmla v18.8h, v29.8h, v2.h[2]\n"
+      "fmla v22.8h, v29.8h, v3.h[2]\n"
+      "fmla v26.8h, v29.8h, v4.h[2]\n"
+      "ldr d29, [x17, #0xc0]\n"
+      "fmla v11.8h, v28.8h, v0.h[2]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[2]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v19.8h, v28.8h, v2.h[2]\n"
+      "fmla v23.8h, v28.8h, v3.h[2]\n"
+      "fmla v27.8h, v28.8h, v4.h[2]\n"
+      "ldr d28, [x17, #0xd0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[3]\n"
+      "fmla v12.8h, v29.8h, v1.h[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v16.8h, v29.8h, v2.h[3]\n"
+      "fmla v20.8h, v29.8h, v3.h[3]\n"
+      "fmla v24.8h, v29.8h, v4.h[3]\n"
+      "ldr d29, [x17, #0xe0]\n"
+      "fmla v9.8h, v28.8h, v0.h[3]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[3]\n"
+      "ldr x21, [x17, #0x108]\n"
+      "fmla v17.8h, v28.8h, v2.h[3]\n"
+      "fmla v21.8h, v28.8h, v3.h[3]\n"
+      "fmla v25.8h, v28.8h, v4.h[3]\n"
+      "ldr d28, [x17, #0xf0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[3]\n"
+      "fmla v14.8h, v29.8h, v1.h[3]\n"
+      "ldr x20, [x17, #0x118]\n"
+      "fmla v18.8h, v29.8h, v2.h[3]\n"
+      "fmla v22.8h, v29.8h, v3.h[3]\n"
+      "fmla v26.8h, v29.8h, v4.h[3]\n"
+      "ldr d29, [x17, #0x100]\n"
+      "fmla v11.8h, v28.8h, v0.h[3]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[3]\n"
+      "ldr x21, [x17, #0x128]\n"
+      "fmla v19.8h, v28.8h, v2.h[3]\n"
+      "fmla v23.8h, v28.8h, v3.h[3]\n"
+      "fmla v27.8h, v28.8h, v4.h[3]\n"
+      "ldr d28, [x17, #0x110]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[4]\n"
+      "fmla v12.8h, v29.8h, v1.h[4]\n"
+      "ldr x20, [x17, #0x138]\n"
+      "fmla v16.8h, v29.8h, v2.h[4]\n"
+      "fmla v20.8h, v29.8h, v3.h[4]\n"
+      "fmla v24.8h, v29.8h, v4.h[4]\n"
+      "ldr d29, [x17, #0x120]\n"
+      "fmla v9.8h, v28.8h, v0.h[4]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[4]\n"
+      "ldr x21, [x17, #0x148]\n"
+      "fmla v17.8h, v28.8h, v2.h[4]\n"
+      "fmla v21.8h, v28.8h, v3.h[4]\n"
+      "fmla v25.8h, v28.8h, v4.h[4]\n"
+      "ldr d28, [x17, #0x130]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[4]\n"
+      "fmla v14.8h, v29.8h, v1.h[4]\n"
+      "ldr x20, [x17, #0x158]\n"
+      "fmla v18.8h, v29.8h, v2.h[4]\n"
+      "fmla v22.8h, v29.8h, v3.h[4]\n"
+      "fmla v26.8h, v29.8h, v4.h[4]\n"
+      "ldr d29, [x17, #0x140]\n"
+      "fmla v11.8h, v28.8h, v0.h[4]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[4]\n"
+      "ldr x21, [x17, #0x168]\n"
+      "fmla v19.8h, v28.8h, v2.h[4]\n"
+      "fmla v23.8h, v28.8h, v3.h[4]\n"
+      "fmla v27.8h, v28.8h, v4.h[4]\n"
+      "ldr d28, [x17, #0x150]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[5]\n"
+      "fmla v12.8h, v29.8h, v1.h[5]\n"
+      "ldr x20, [x17, #0x178]\n"
+      "fmla v16.8h, v29.8h, v2.h[5]\n"
+      "fmla v20.8h, v29.8h, v3.h[5]\n"
+      "fmla v24.8h, v29.8h, v4.h[5]\n"
+      "ldr d29, [x17, #0x160]\n"
+      "fmla v9.8h, v28.8h, v0.h[5]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[5]\n"
+      "ldr x21, [x17, #0x188]\n"
+      "fmla v17.8h, v28.8h, v2.h[5]\n"
+      "fmla v21.8h, v28.8h, v3.h[5]\n"
+      "fmla v25.8h, v28.8h, v4.h[5]\n"
+      "ldr d28, [x17, #0x170]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[5]\n"
+      "fmla v14.8h, v29.8h, v1.h[5]\n"
+      "ldr x20, [x17, #0x198]\n"
+      "fmla v18.8h, v29.8h, v2.h[5]\n"
+      "fmla v22.8h, v29.8h, v3.h[5]\n"
+      "fmla v26.8h, v29.8h, v4.h[5]\n"
+      "ldr d29, [x17, #0x180]\n"
+      "fmla v11.8h, v28.8h, v0.h[5]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[5]\n"
+      "ldr x21, [x17, #0x1a8]\n"
+      "fmla v19.8h, v28.8h, v2.h[5]\n"
+      "fmla v23.8h, v28.8h, v3.h[5]\n"
+      "fmla v27.8h, v28.8h, v4.h[5]\n"
+      "ldr d28, [x17, #0x190]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[6]\n"
+      "fmla v12.8h, v29.8h, v1.h[6]\n"
+      "ldr x20, [x17, #0x1b8]\n"
+      "fmla v16.8h, v29.8h, v2.h[6]\n"
+      "fmla v20.8h, v29.8h, v3.h[6]\n"
+      "fmla v24.8h, v29.8h, v4.h[6]\n"
+      "ldr d29, [x17, #0x1a0]\n"
+      "fmla v9.8h, v28.8h, v0.h[6]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[6]\n"
+      "ldr x21, [x17, #0x1c8]\n"
+      "fmla v17.8h, v28.8h, v2.h[6]\n"
+      "fmla v21.8h, v28.8h, v3.h[6]\n"
+      "fmla v25.8h, v28.8h, v4.h[6]\n"
+      "ldr d28, [x17, #0x1b0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.8h, v29.8h, v0.h[6]\n"
+      "fmla v14.8h, v29.8h, v1.h[6]\n"
+      "ldr x20, [x17, #0x1d8]\n"
+      "fmla v18.8h, v29.8h, v2.h[6]\n"
+      "fmla v22.8h, v29.8h, v3.h[6]\n"
+      "fmla v26.8h, v29.8h, v4.h[6]\n"
+      "ldr d29, [x17, #0x1c0]\n"
+      "fmla v11.8h, v28.8h, v0.h[6]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.8h, v28.8h, v1.h[6]\n"
+      "ldr x21, [x17, #0x1e8]\n"
+      "fmla v19.8h, v28.8h, v2.h[6]\n"
+      "fmla v23.8h, v28.8h, v3.h[6]\n"
+      "fmla v27.8h, v28.8h, v4.h[6]\n"
+      "ldr d28, [x17, #0x1d0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.8h, v29.8h, v0.h[7]\n"
+      "fmla v12.8h, v29.8h, v1.h[7]\n"
+      "ldr x20, [x17, #0x1f8]\n"
+      "fmla v16.8h, v29.8h, v2.h[7]\n"
+      "fmla v20.8h, v29.8h, v3.h[7]\n"
+      "fmla v24.8h, v29.8h, v4.h[7]\n"
+      "ldr d29, [x17, #0x1e0]\n"
+      "fmla v9.8h, v28.8h, v0.h[7]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.8h, v28.8h, v1.h[7]\n"
+      "fmla v17.8h, v28.8h, v2.h[7]\n"
+      "fmla v21.8h, v28.8h, v3.h[7]\n"
+      "fmla v25.8h, v28.8h, v4.h[7]\n"
+      "ldr d28, [x17, #0x1f0]\n"
+      "mov v28.d[1], x20\n"
       "add x17, x17, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x18]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v10.8h, v29.8h, v0.h[7]\n"
+      "ldr x21, [x17, #0x8]\n"
+      "fmla v14.8h, v29.8h, v1.h[7]\n"
+      "ldr x20, [x17, #0x18]\n"
+      "fmla v18.8h, v29.8h, v2.h[7]\n"
+      "fmla v22.8h, v29.8h, v3.h[7]\n"
+      "fmla v26.8h, v29.8h, v4.h[7]\n"
       "ldr d6, [x17, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v28.8h, v0.h[7]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
-      "ldr d3, [x25, #0x0]\n"
-      "fmla v27.8h, v7.8h, v4.h[7]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "fmla v15.8h, v28.8h, v1.h[7]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "fmla v19.8h, v28.8h, v2.h[7]\n"
+      "ldr d2, [x11, #0x0]\n"
+      "fmla v23.8h, v28.8h, v3.h[7]\n"
+      "ldr d3, [x10, #0x0]\n"
+      "fmla v27.8h, v28.8h, v4.h[7]\n"
+      "ldr d4, [x9, #0x0]\n"
       "ldr d7, [x17, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x26\n"
+      "mov v1.d[1], x25\n"
+      "mov v2.d[1], x24\n"
+      "mov v3.d[1], x23\n"
       "mov v4.d[1], x22\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "bge 222b\n"
       "223:"  // Height 5: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q29, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "sub x14, x14, #0x8\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "fmla v24.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "fmla v26.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "fmla v27.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "fmla v24.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "fmla v25.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "fmla v26.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "fmla v27.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "fmla v24.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "fmla v25.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "fmla v26.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x17, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "fmla v27.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x17, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "fmla v24.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x17, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "fmla v25.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x17, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "fmla v26.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x17, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "fmla v27.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x17, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "fmla v24.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x17, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "fmla v25.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x17, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "fmla v26.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x17, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "fmla v27.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x17, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "fmla v24.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x17, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "fmla v25.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x17, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "fmla v26.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x17, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "fmla v27.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x17, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "fmla v24.8h, v6.8h, v4.h[7]\n"
-      "ldr q6, [x17, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "fmla v25.8h, v7.8h, v4.h[7]\n"
-      "ldr q7, [x17, #0x1f0]\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "ldr q28, [x17, #0x30]\n"
+      "fmla v10.8h, v29.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v14.8h, v29.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "fmla v22.8h, v29.8h, v3.h[0]\n"
+      "fmla v26.8h, v29.8h, v4.h[0]\n"
+      "ldr q29, [x17, #0x40]\n"
+      "fmla v11.8h, v28.8h, v0.h[0]\n"
+      "fmla v15.8h, v28.8h, v1.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v3.h[0]\n"
+      "fmla v27.8h, v28.8h, v4.h[0]\n"
+      "ldr q28, [x17, #0x50]\n"
+      "fmla v8.8h, v29.8h, v0.h[1]\n"
+      "fmla v12.8h, v29.8h, v1.h[1]\n"
+      "fmla v16.8h, v29.8h, v2.h[1]\n"
+      "fmla v20.8h, v29.8h, v3.h[1]\n"
+      "fmla v24.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x17, #0x60]\n"
+      "fmla v9.8h, v28.8h, v0.h[1]\n"
+      "fmla v13.8h, v28.8h, v1.h[1]\n"
+      "fmla v17.8h, v28.8h, v2.h[1]\n"
+      "fmla v21.8h, v28.8h, v3.h[1]\n"
+      "fmla v25.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x17, #0x70]\n"
+      "fmla v10.8h, v29.8h, v0.h[1]\n"
+      "fmla v14.8h, v29.8h, v1.h[1]\n"
+      "fmla v18.8h, v29.8h, v2.h[1]\n"
+      "fmla v22.8h, v29.8h, v3.h[1]\n"
+      "fmla v26.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x17, #0x80]\n"
+      "fmla v11.8h, v28.8h, v0.h[1]\n"
+      "fmla v15.8h, v28.8h, v1.h[1]\n"
+      "fmla v19.8h, v28.8h, v2.h[1]\n"
+      "fmla v23.8h, v28.8h, v3.h[1]\n"
+      "fmla v27.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x17, #0x90]\n"
+      "fmla v8.8h, v29.8h, v0.h[2]\n"
+      "fmla v12.8h, v29.8h, v1.h[2]\n"
+      "fmla v16.8h, v29.8h, v2.h[2]\n"
+      "fmla v20.8h, v29.8h, v3.h[2]\n"
+      "fmla v24.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x17, #0xa0]\n"
+      "fmla v9.8h, v28.8h, v0.h[2]\n"
+      "fmla v13.8h, v28.8h, v1.h[2]\n"
+      "fmla v17.8h, v28.8h, v2.h[2]\n"
+      "fmla v21.8h, v28.8h, v3.h[2]\n"
+      "fmla v25.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x17, #0xb0]\n"
+      "fmla v10.8h, v29.8h, v0.h[2]\n"
+      "fmla v14.8h, v29.8h, v1.h[2]\n"
+      "fmla v18.8h, v29.8h, v2.h[2]\n"
+      "fmla v22.8h, v29.8h, v3.h[2]\n"
+      "fmla v26.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x17, #0xc0]\n"
+      "fmla v11.8h, v28.8h, v0.h[2]\n"
+      "fmla v15.8h, v28.8h, v1.h[2]\n"
+      "fmla v19.8h, v28.8h, v2.h[2]\n"
+      "fmla v23.8h, v28.8h, v3.h[2]\n"
+      "fmla v27.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x17, #0xd0]\n"
+      "fmla v8.8h, v29.8h, v0.h[3]\n"
+      "fmla v12.8h, v29.8h, v1.h[3]\n"
+      "fmla v16.8h, v29.8h, v2.h[3]\n"
+      "fmla v20.8h, v29.8h, v3.h[3]\n"
+      "fmla v24.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x17, #0xe0]\n"
+      "fmla v9.8h, v28.8h, v0.h[3]\n"
+      "fmla v13.8h, v28.8h, v1.h[3]\n"
+      "fmla v17.8h, v28.8h, v2.h[3]\n"
+      "fmla v21.8h, v28.8h, v3.h[3]\n"
+      "fmla v25.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x17, #0xf0]\n"
+      "fmla v10.8h, v29.8h, v0.h[3]\n"
+      "fmla v14.8h, v29.8h, v1.h[3]\n"
+      "fmla v18.8h, v29.8h, v2.h[3]\n"
+      "fmla v22.8h, v29.8h, v3.h[3]\n"
+      "fmla v26.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x17, #0x100]\n"
+      "fmla v11.8h, v28.8h, v0.h[3]\n"
+      "fmla v15.8h, v28.8h, v1.h[3]\n"
+      "fmla v19.8h, v28.8h, v2.h[3]\n"
+      "fmla v23.8h, v28.8h, v3.h[3]\n"
+      "fmla v27.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x17, #0x110]\n"
+      "fmla v8.8h, v29.8h, v0.h[4]\n"
+      "fmla v12.8h, v29.8h, v1.h[4]\n"
+      "fmla v16.8h, v29.8h, v2.h[4]\n"
+      "fmla v20.8h, v29.8h, v3.h[4]\n"
+      "fmla v24.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x17, #0x120]\n"
+      "fmla v9.8h, v28.8h, v0.h[4]\n"
+      "fmla v13.8h, v28.8h, v1.h[4]\n"
+      "fmla v17.8h, v28.8h, v2.h[4]\n"
+      "fmla v21.8h, v28.8h, v3.h[4]\n"
+      "fmla v25.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x17, #0x130]\n"
+      "fmla v10.8h, v29.8h, v0.h[4]\n"
+      "fmla v14.8h, v29.8h, v1.h[4]\n"
+      "fmla v18.8h, v29.8h, v2.h[4]\n"
+      "fmla v22.8h, v29.8h, v3.h[4]\n"
+      "fmla v26.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x17, #0x140]\n"
+      "fmla v11.8h, v28.8h, v0.h[4]\n"
+      "fmla v15.8h, v28.8h, v1.h[4]\n"
+      "fmla v19.8h, v28.8h, v2.h[4]\n"
+      "fmla v23.8h, v28.8h, v3.h[4]\n"
+      "fmla v27.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x17, #0x150]\n"
+      "fmla v8.8h, v29.8h, v0.h[5]\n"
+      "fmla v12.8h, v29.8h, v1.h[5]\n"
+      "fmla v16.8h, v29.8h, v2.h[5]\n"
+      "fmla v20.8h, v29.8h, v3.h[5]\n"
+      "fmla v24.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x17, #0x160]\n"
+      "fmla v9.8h, v28.8h, v0.h[5]\n"
+      "fmla v13.8h, v28.8h, v1.h[5]\n"
+      "fmla v17.8h, v28.8h, v2.h[5]\n"
+      "fmla v21.8h, v28.8h, v3.h[5]\n"
+      "fmla v25.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x17, #0x170]\n"
+      "fmla v10.8h, v29.8h, v0.h[5]\n"
+      "fmla v14.8h, v29.8h, v1.h[5]\n"
+      "fmla v18.8h, v29.8h, v2.h[5]\n"
+      "fmla v22.8h, v29.8h, v3.h[5]\n"
+      "fmla v26.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x17, #0x180]\n"
+      "fmla v11.8h, v28.8h, v0.h[5]\n"
+      "fmla v15.8h, v28.8h, v1.h[5]\n"
+      "fmla v19.8h, v28.8h, v2.h[5]\n"
+      "fmla v23.8h, v28.8h, v3.h[5]\n"
+      "fmla v27.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x17, #0x190]\n"
+      "fmla v8.8h, v29.8h, v0.h[6]\n"
+      "fmla v12.8h, v29.8h, v1.h[6]\n"
+      "fmla v16.8h, v29.8h, v2.h[6]\n"
+      "fmla v20.8h, v29.8h, v3.h[6]\n"
+      "fmla v24.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x17, #0x1a0]\n"
+      "fmla v9.8h, v28.8h, v0.h[6]\n"
+      "fmla v13.8h, v28.8h, v1.h[6]\n"
+      "fmla v17.8h, v28.8h, v2.h[6]\n"
+      "fmla v21.8h, v28.8h, v3.h[6]\n"
+      "fmla v25.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x17, #0x1b0]\n"
+      "fmla v10.8h, v29.8h, v0.h[6]\n"
+      "fmla v14.8h, v29.8h, v1.h[6]\n"
+      "fmla v18.8h, v29.8h, v2.h[6]\n"
+      "fmla v22.8h, v29.8h, v3.h[6]\n"
+      "fmla v26.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x17, #0x1c0]\n"
+      "fmla v11.8h, v28.8h, v0.h[6]\n"
+      "fmla v15.8h, v28.8h, v1.h[6]\n"
+      "fmla v19.8h, v28.8h, v2.h[6]\n"
+      "fmla v23.8h, v28.8h, v3.h[6]\n"
+      "fmla v27.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x17, #0x1d0]\n"
+      "fmla v8.8h, v29.8h, v0.h[7]\n"
+      "fmla v12.8h, v29.8h, v1.h[7]\n"
+      "fmla v16.8h, v29.8h, v2.h[7]\n"
+      "fmla v20.8h, v29.8h, v3.h[7]\n"
+      "fmla v24.8h, v29.8h, v4.h[7]\n"
+      "ldr q29, [x17, #0x1e0]\n"
+      "fmla v9.8h, v28.8h, v0.h[7]\n"
+      "fmla v13.8h, v28.8h, v1.h[7]\n"
+      "fmla v17.8h, v28.8h, v2.h[7]\n"
+      "fmla v21.8h, v28.8h, v3.h[7]\n"
+      "fmla v25.8h, v28.8h, v4.h[7]\n"
+      "ldr q28, [x17, #0x1f0]\n"
+      "fmla v10.8h, v29.8h, v0.h[7]\n"
       "add x17, x17, #0x200\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v26.8h, v6.8h, v4.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
-      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v14.8h, v29.8h, v1.h[7]\n"
+      "fmla v18.8h, v29.8h, v2.h[7]\n"
+      "fmla v22.8h, v29.8h, v3.h[7]\n"
+      "fmla v26.8h, v29.8h, v4.h[7]\n"
+      "fmla v11.8h, v28.8h, v0.h[7]\n"
+      "fmla v15.8h, v28.8h, v1.h[7]\n"
+      "fmla v19.8h, v28.8h, v2.h[7]\n"
+      "fmla v23.8h, v28.8h, v3.h[7]\n"
+      "fmla v27.8h, v28.8h, v4.h[7]\n"
       "224:"  // Height 5: Multiply loop: Main loop skip
       "cbz x14, 226f\n"
       "225:"  // Height 5: Multiply loop: Odd block loop
-      "ldr h0, [x13], #0x2\n"
+      "ldr h4, [x13], #0x2\n"
       "sub x14, x14, #0x1\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr h3, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h1, [x10], #0x2\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr q29, [x17, #0x0]\n"
+      "fmla v8.8h, v29.8h, v4.h[0]\n"
+      "ldr q28, [x17, #0x10]\n"
+      "fmla v12.8h, v29.8h, v3.h[0]\n"
+      "fmla v16.8h, v29.8h, v2.h[0]\n"
+      "fmla v20.8h, v29.8h, v1.h[0]\n"
+      "fmla v24.8h, v29.8h, v0.h[0]\n"
+      "ldr q29, [x17, #0x20]\n"
+      "fmla v9.8h, v28.8h, v4.h[0]\n"
+      "fmla v13.8h, v28.8h, v3.h[0]\n"
+      "fmla v17.8h, v28.8h, v2.h[0]\n"
+      "fmla v21.8h, v28.8h, v1.h[0]\n"
+      "fmla v25.8h, v28.8h, v0.h[0]\n"
+      "ldr q28, [x17, #0x30]\n"
+      "fmla v10.8h, v29.8h, v4.h[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "fmla v14.8h, v29.8h, v3.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v1.h[0]\n"
+      "fmla v26.8h, v29.8h, v0.h[0]\n"
+      "fmla v11.8h, v28.8h, v4.h[0]\n"
+      "fmla v15.8h, v28.8h, v3.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v1.h[0]\n"
+      "fmla v27.8h, v28.8h, v0.h[0]\n"
       "cbnz x14, 225b\n"
       "226:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -4069,49 +4069,49 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 227f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v0.8h\n"
-      "fmin v9.8h, v9.8h, v0.8h\n"
-      "fmin v10.8h, v10.8h, v0.8h\n"
-      "fmin v11.8h, v11.8h, v0.8h\n"
-      "fmin v12.8h, v12.8h, v0.8h\n"
-      "fmin v13.8h, v13.8h, v0.8h\n"
-      "fmin v14.8h, v14.8h, v0.8h\n"
-      "fmin v15.8h, v15.8h, v0.8h\n"
-      "fmin v16.8h, v16.8h, v0.8h\n"
-      "fmin v17.8h, v17.8h, v0.8h\n"
-      "fmin v18.8h, v18.8h, v0.8h\n"
-      "fmin v19.8h, v19.8h, v0.8h\n"
-      "fmin v20.8h, v20.8h, v0.8h\n"
-      "fmin v21.8h, v21.8h, v0.8h\n"
-      "fmin v22.8h, v22.8h, v0.8h\n"
-      "fmin v23.8h, v23.8h, v0.8h\n"
-      "fmin v24.8h, v24.8h, v0.8h\n"
-      "fmin v25.8h, v25.8h, v0.8h\n"
-      "fmin v26.8h, v26.8h, v0.8h\n"
-      "fmin v27.8h, v27.8h, v0.8h\n"
+      "ld1r { v28.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v28.8h\n"
+      "fmin v9.8h, v9.8h, v28.8h\n"
+      "fmin v10.8h, v10.8h, v28.8h\n"
+      "fmin v11.8h, v11.8h, v28.8h\n"
+      "fmin v12.8h, v12.8h, v28.8h\n"
+      "fmin v13.8h, v13.8h, v28.8h\n"
+      "fmin v14.8h, v14.8h, v28.8h\n"
+      "fmin v15.8h, v15.8h, v28.8h\n"
+      "fmin v16.8h, v16.8h, v28.8h\n"
+      "fmin v17.8h, v17.8h, v28.8h\n"
+      "fmin v18.8h, v18.8h, v28.8h\n"
+      "fmin v19.8h, v19.8h, v28.8h\n"
+      "fmin v20.8h, v20.8h, v28.8h\n"
+      "fmin v21.8h, v21.8h, v28.8h\n"
+      "fmin v22.8h, v22.8h, v28.8h\n"
+      "fmin v23.8h, v23.8h, v28.8h\n"
+      "fmin v24.8h, v24.8h, v28.8h\n"
+      "fmin v25.8h, v25.8h, v28.8h\n"
+      "fmin v26.8h, v26.8h, v28.8h\n"
+      "fmin v27.8h, v27.8h, v28.8h\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
-      "fmax v20.8h, v20.8h, v0.8h\n"
-      "fmax v21.8h, v21.8h, v0.8h\n"
-      "fmax v22.8h, v22.8h, v0.8h\n"
-      "fmax v23.8h, v23.8h, v0.8h\n"
-      "fmax v24.8h, v24.8h, v0.8h\n"
-      "fmax v25.8h, v25.8h, v0.8h\n"
-      "fmax v26.8h, v26.8h, v0.8h\n"
-      "fmax v27.8h, v27.8h, v0.8h\n"
+      "ld1r { v28.8h }, [x20]\n"
+      "fmax v8.8h, v8.8h, v28.8h\n"
+      "fmax v9.8h, v9.8h, v28.8h\n"
+      "fmax v10.8h, v10.8h, v28.8h\n"
+      "fmax v11.8h, v11.8h, v28.8h\n"
+      "fmax v12.8h, v12.8h, v28.8h\n"
+      "fmax v13.8h, v13.8h, v28.8h\n"
+      "fmax v14.8h, v14.8h, v28.8h\n"
+      "fmax v15.8h, v15.8h, v28.8h\n"
+      "fmax v16.8h, v16.8h, v28.8h\n"
+      "fmax v17.8h, v17.8h, v28.8h\n"
+      "fmax v18.8h, v18.8h, v28.8h\n"
+      "fmax v19.8h, v19.8h, v28.8h\n"
+      "fmax v20.8h, v20.8h, v28.8h\n"
+      "fmax v21.8h, v21.8h, v28.8h\n"
+      "fmax v22.8h, v22.8h, v28.8h\n"
+      "fmax v23.8h, v23.8h, v28.8h\n"
+      "fmax v24.8h, v24.8h, v28.8h\n"
+      "fmax v25.8h, v25.8h, v28.8h\n"
+      "fmax v26.8h, v26.8h, v28.8h\n"
+      "fmax v27.8h, v27.8h, v28.8h\n"
       "227:"  // Height 5: No activation
       "cmp x8, #0x20\n"
       "bge 244f\n"
@@ -4736,98 +4736,98 @@
       "268:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 269f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
+      "ldr x28, [x20, #0x28]\n"
       "cbnz x15, 270f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #1\n"
+      "add x12, x12, x20, LSL #1\n"
+      "add x11, x11, x20, LSL #1\n"
+      "add x10, x10, x20, LSL #1\n"
       "add x9, x9, x20, LSL #1\n"
-      "add x27, x27, x20, LSL #1\n"
-      "add x25, x25, x20, LSL #1\n"
-      "add x23, x23, x20, LSL #1\n"
-      "add x21, x21, x20, LSL #1\n"
+      "add x28, x28, x20, LSL #1\n"
       "b 270f\n"
       "269:"  // Height 6: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #1\n"
-      "add x27, x9, x20, LSL #1\n"
-      "add x25, x27, x20, LSL #1\n"
-      "add x23, x25, x20, LSL #1\n"
-      "add x21, x23, x20, LSL #1\n"
+      "add x12, x13, x21, LSL #1\n"
+      "add x11, x12, x21, LSL #1\n"
+      "add x10, x11, x21, LSL #1\n"
+      "add x9, x10, x21, LSL #1\n"
+      "add x28, x9, x21, LSL #1\n"
       "270:"  // Height 6: input setup done
       "cmp x14, #0x8\n"
       "blt 273f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x10\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
-      "ldr q5, [x21, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
+      "ldr q5, [x28, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 272f\n"
       "271:"  // Height 6: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v28.8h, v6.8h, v5.h[0]\n"
       "ldr d6, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       "fmla v29.8h, v7.8h, v5.h[0]\n"
       "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[0]\n"
       "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr x11, [x17, #0x58]\n"
+      "ldr x20, [x17, #0x58]\n"
       "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr x10, [x13, #0x8]\n"
+      "ldr x27, [x13, #0x8]\n"
       "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x26, [x12, #0x8]\n"
       "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr x25, [x11, #0x8]\n"
       "fmla v30.8h, v6.8h, v5.h[0]\n"
       "ldr d6, [x17, #0x40]\n"
       "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr x12, [x17, #0x68]\n"
+      "ldr x21, [x17, #0x68]\n"
       "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr x24, [x25, #0x8]\n"
+      "ldr x24, [x10, #0x8]\n"
       "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr x22, [x23, #0x8]\n"
+      "ldr x23, [x9, #0x8]\n"
       "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr x20, [x21, #0x8]\n"
+      "ldr x22, [x28, #0x8]\n"
       "fmla v31.8h, v7.8h, v5.h[0]\n"
       "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[1]\n"
       "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x78]\n"
+      "ldr x20, [x17, #0x78]\n"
       "fmla v16.8h, v6.8h, v2.h[1]\n"
       "sub x14, x14, #0x8\n"
       "fmla v20.8h, v6.8h, v3.h[1]\n"
@@ -4837,240 +4837,240 @@
       "fmla v28.8h, v6.8h, v5.h[1]\n"
       "ldr d6, [x17, #0x60]\n"
       "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0x88]\n"
+      "ldr x21, [x17, #0x88]\n"
       "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v29.8h, v7.8h, v5.h[1]\n"
       "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[1]\n"
       "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr x11, [x17, #0x98]\n"
+      "ldr x20, [x17, #0x98]\n"
       "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "fmla v26.8h, v6.8h, v4.h[1]\n"
       "fmla v30.8h, v6.8h, v5.h[1]\n"
       "ldr d6, [x17, #0x80]\n"
       "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
+      "ldr x21, [x17, #0xa8]\n"
       "fmla v19.8h, v7.8h, v2.h[1]\n"
       "fmla v23.8h, v7.8h, v3.h[1]\n"
       "fmla v27.8h, v7.8h, v4.h[1]\n"
       "fmla v31.8h, v7.8h, v5.h[1]\n"
       "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[2]\n"
       "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
+      "ldr x20, [x17, #0xb8]\n"
       "fmla v16.8h, v6.8h, v2.h[2]\n"
       "fmla v20.8h, v6.8h, v3.h[2]\n"
       "fmla v24.8h, v6.8h, v4.h[2]\n"
       "fmla v28.8h, v6.8h, v5.h[2]\n"
       "ldr d6, [x17, #0xa0]\n"
       "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
+      "ldr x21, [x17, #0xc8]\n"
       "fmla v17.8h, v7.8h, v2.h[2]\n"
       "fmla v21.8h, v7.8h, v3.h[2]\n"
       "fmla v25.8h, v7.8h, v4.h[2]\n"
       "fmla v29.8h, v7.8h, v5.h[2]\n"
       "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[2]\n"
       "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
+      "ldr x20, [x17, #0xd8]\n"
       "fmla v18.8h, v6.8h, v2.h[2]\n"
       "fmla v22.8h, v6.8h, v3.h[2]\n"
       "fmla v26.8h, v6.8h, v4.h[2]\n"
       "fmla v30.8h, v6.8h, v5.h[2]\n"
       "ldr d6, [x17, #0xc0]\n"
       "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
+      "ldr x21, [x17, #0xe8]\n"
       "fmla v19.8h, v7.8h, v2.h[2]\n"
       "fmla v23.8h, v7.8h, v3.h[2]\n"
       "fmla v27.8h, v7.8h, v4.h[2]\n"
       "fmla v31.8h, v7.8h, v5.h[2]\n"
       "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[3]\n"
       "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
+      "ldr x20, [x17, #0xf8]\n"
       "fmla v16.8h, v6.8h, v2.h[3]\n"
       "fmla v20.8h, v6.8h, v3.h[3]\n"
       "fmla v24.8h, v6.8h, v4.h[3]\n"
       "fmla v28.8h, v6.8h, v5.h[3]\n"
       "ldr d6, [x17, #0xe0]\n"
       "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x108]\n"
+      "ldr x21, [x17, #0x108]\n"
       "fmla v17.8h, v7.8h, v2.h[3]\n"
       "fmla v21.8h, v7.8h, v3.h[3]\n"
       "fmla v25.8h, v7.8h, v4.h[3]\n"
       "fmla v29.8h, v7.8h, v5.h[3]\n"
       "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[3]\n"
       "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr x11, [x17, #0x118]\n"
+      "ldr x20, [x17, #0x118]\n"
       "fmla v18.8h, v6.8h, v2.h[3]\n"
       "fmla v22.8h, v6.8h, v3.h[3]\n"
       "fmla v26.8h, v6.8h, v4.h[3]\n"
       "fmla v30.8h, v6.8h, v5.h[3]\n"
       "ldr d6, [x17, #0x100]\n"
       "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr x12, [x17, #0x128]\n"
+      "ldr x21, [x17, #0x128]\n"
       "fmla v19.8h, v7.8h, v2.h[3]\n"
       "fmla v23.8h, v7.8h, v3.h[3]\n"
       "fmla v27.8h, v7.8h, v4.h[3]\n"
       "fmla v31.8h, v7.8h, v5.h[3]\n"
       "ldr d7, [x17, #0x110]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[4]\n"
       "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x138]\n"
+      "ldr x20, [x17, #0x138]\n"
       "fmla v16.8h, v6.8h, v2.h[4]\n"
       "fmla v20.8h, v6.8h, v3.h[4]\n"
       "fmla v24.8h, v6.8h, v4.h[4]\n"
       "fmla v28.8h, v6.8h, v5.h[4]\n"
       "ldr d6, [x17, #0x120]\n"
       "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x148]\n"
+      "ldr x21, [x17, #0x148]\n"
       "fmla v17.8h, v7.8h, v2.h[4]\n"
       "fmla v21.8h, v7.8h, v3.h[4]\n"
       "fmla v25.8h, v7.8h, v4.h[4]\n"
       "fmla v29.8h, v7.8h, v5.h[4]\n"
       "ldr d7, [x17, #0x130]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[4]\n"
       "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr x11, [x17, #0x158]\n"
+      "ldr x20, [x17, #0x158]\n"
       "fmla v18.8h, v6.8h, v2.h[4]\n"
       "fmla v22.8h, v6.8h, v3.h[4]\n"
       "fmla v26.8h, v6.8h, v4.h[4]\n"
       "fmla v30.8h, v6.8h, v5.h[4]\n"
       "ldr d6, [x17, #0x140]\n"
       "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr x12, [x17, #0x168]\n"
+      "ldr x21, [x17, #0x168]\n"
       "fmla v19.8h, v7.8h, v2.h[4]\n"
       "fmla v23.8h, v7.8h, v3.h[4]\n"
       "fmla v27.8h, v7.8h, v4.h[4]\n"
       "fmla v31.8h, v7.8h, v5.h[4]\n"
       "ldr d7, [x17, #0x150]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[5]\n"
       "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x178]\n"
+      "ldr x20, [x17, #0x178]\n"
       "fmla v16.8h, v6.8h, v2.h[5]\n"
       "fmla v20.8h, v6.8h, v3.h[5]\n"
       "fmla v24.8h, v6.8h, v4.h[5]\n"
       "fmla v28.8h, v6.8h, v5.h[5]\n"
       "ldr d6, [x17, #0x160]\n"
       "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x188]\n"
+      "ldr x21, [x17, #0x188]\n"
       "fmla v17.8h, v7.8h, v2.h[5]\n"
       "fmla v21.8h, v7.8h, v3.h[5]\n"
       "fmla v25.8h, v7.8h, v4.h[5]\n"
       "fmla v29.8h, v7.8h, v5.h[5]\n"
       "ldr d7, [x17, #0x170]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[5]\n"
       "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr x11, [x17, #0x198]\n"
+      "ldr x20, [x17, #0x198]\n"
       "fmla v18.8h, v6.8h, v2.h[5]\n"
       "fmla v22.8h, v6.8h, v3.h[5]\n"
       "fmla v26.8h, v6.8h, v4.h[5]\n"
       "fmla v30.8h, v6.8h, v5.h[5]\n"
       "ldr d6, [x17, #0x180]\n"
       "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr x12, [x17, #0x1a8]\n"
+      "ldr x21, [x17, #0x1a8]\n"
       "fmla v19.8h, v7.8h, v2.h[5]\n"
       "fmla v23.8h, v7.8h, v3.h[5]\n"
       "fmla v27.8h, v7.8h, v4.h[5]\n"
       "fmla v31.8h, v7.8h, v5.h[5]\n"
       "ldr d7, [x17, #0x190]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[6]\n"
       "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1b8]\n"
+      "ldr x20, [x17, #0x1b8]\n"
       "fmla v16.8h, v6.8h, v2.h[6]\n"
       "fmla v20.8h, v6.8h, v3.h[6]\n"
       "fmla v24.8h, v6.8h, v4.h[6]\n"
       "fmla v28.8h, v6.8h, v5.h[6]\n"
       "ldr d6, [x17, #0x1a0]\n"
       "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1c8]\n"
+      "ldr x21, [x17, #0x1c8]\n"
       "fmla v17.8h, v7.8h, v2.h[6]\n"
       "fmla v21.8h, v7.8h, v3.h[6]\n"
       "fmla v25.8h, v7.8h, v4.h[6]\n"
       "fmla v29.8h, v7.8h, v5.h[6]\n"
       "ldr d7, [x17, #0x1b0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.8h, v6.8h, v0.h[6]\n"
       "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr x11, [x17, #0x1d8]\n"
+      "ldr x20, [x17, #0x1d8]\n"
       "fmla v18.8h, v6.8h, v2.h[6]\n"
       "fmla v22.8h, v6.8h, v3.h[6]\n"
       "fmla v26.8h, v6.8h, v4.h[6]\n"
       "fmla v30.8h, v6.8h, v5.h[6]\n"
       "ldr d6, [x17, #0x1c0]\n"
       "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr x12, [x17, #0x1e8]\n"
+      "ldr x21, [x17, #0x1e8]\n"
       "fmla v19.8h, v7.8h, v2.h[6]\n"
       "fmla v23.8h, v7.8h, v3.h[6]\n"
       "fmla v27.8h, v7.8h, v4.h[6]\n"
       "fmla v31.8h, v7.8h, v5.h[6]\n"
       "ldr d7, [x17, #0x1d0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.8h, v6.8h, v0.h[7]\n"
       "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x1f8]\n"
+      "ldr x20, [x17, #0x1f8]\n"
       "fmla v16.8h, v6.8h, v2.h[7]\n"
       "fmla v20.8h, v6.8h, v3.h[7]\n"
       "fmla v24.8h, v6.8h, v4.h[7]\n"
       "fmla v28.8h, v6.8h, v5.h[7]\n"
       "ldr d6, [x17, #0x1e0]\n"
       "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.8h, v7.8h, v1.h[7]\n"
       "fmla v17.8h, v7.8h, v2.h[7]\n"
       "fmla v21.8h, v7.8h, v3.h[7]\n"
       "fmla v25.8h, v7.8h, v4.h[7]\n"
       "fmla v29.8h, v7.8h, v5.h[7]\n"
       "ldr d7, [x17, #0x1f0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "add x17, x17, #0x200\n"
       "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "ldr x12, [x17, #0x8]\n"
+      "ldr x21, [x17, #0x8]\n"
       "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "ldr x11, [x17, #0x18]\n"
+      "ldr x20, [x17, #0x18]\n"
       "fmla v18.8h, v6.8h, v2.h[7]\n"
       "fmla v22.8h, v6.8h, v3.h[7]\n"
       "fmla v26.8h, v6.8h, v4.h[7]\n"
@@ -5079,56 +5079,56 @@
       "fmla v11.8h, v7.8h, v0.h[7]\n"
       "ldr d0, [x13, #0x0]\n"
       "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x12, #0x0]\n"
       "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x11, #0x0]\n"
       "fmla v23.8h, v7.8h, v3.h[7]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "ldr d3, [x10, #0x0]\n"
       "fmla v27.8h, v7.8h, v4.h[7]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "ldr d4, [x9, #0x0]\n"
       "fmla v31.8h, v7.8h, v5.h[7]\n"
-      "ldr d5, [x21, #0x0]\n"
+      "ldr d5, [x28, #0x0]\n"
       "ldr d7, [x17, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x27\n"
+      "mov v1.d[1], x26\n"
+      "mov v2.d[1], x25\n"
       "mov v3.d[1], x24\n"
-      "mov v4.d[1], x22\n"
-      "mov v5.d[1], x20\n"
-      "mov v7.d[1], x11\n"
+      "mov v4.d[1], x23\n"
+      "mov v5.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 271b\n"
       "272:"  // Height 6: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v28.8h, v6.8h, v5.h[0]\n"
       "ldr q6, [x17, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "sub x14, x14, #0x8\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v29.8h, v7.8h, v5.h[0]\n"
       "ldr q7, [x17, #0x30]\n"
       "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "fmla v22.8h, v6.8h, v3.h[0]\n"
       "fmla v26.8h, v6.8h, v4.h[0]\n"
       "fmla v30.8h, v6.8h, v5.h[0]\n"
@@ -5338,42 +5338,42 @@
       "273:"  // Height 6: Multiply loop: Main loop skip
       "cbz x14, 275f\n"
       "274:"  // Height 6: Multiply loop: Odd block loop
-      "ldr h0, [x13], #0x2\n"
+      "ldr h7, [x13], #0x2\n"
       "sub x14, x14, #0x1\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
-      "ldr h5, [x21], #0x2\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "fmla v28.8h, v6.8h, v5.h[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "fmla v29.8h, v7.8h, v5.h[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr h6, [x12], #0x2\n"
+      "ldr h5, [x11], #0x2\n"
+      "ldr h4, [x10], #0x2\n"
+      "ldr h3, [x9], #0x2\n"
+      "ldr h2, [x28], #0x2\n"
+      "ldr q1, [x17, #0x0]\n"
+      "fmla v8.8h, v1.8h, v7.h[0]\n"
+      "ldr q0, [x17, #0x10]\n"
+      "fmla v12.8h, v1.8h, v6.h[0]\n"
+      "fmla v16.8h, v1.8h, v5.h[0]\n"
+      "fmla v20.8h, v1.8h, v4.h[0]\n"
+      "fmla v24.8h, v1.8h, v3.h[0]\n"
+      "fmla v28.8h, v1.8h, v2.h[0]\n"
+      "ldr q1, [x17, #0x20]\n"
+      "fmla v9.8h, v0.8h, v7.h[0]\n"
+      "fmla v13.8h, v0.8h, v6.h[0]\n"
+      "fmla v17.8h, v0.8h, v5.h[0]\n"
+      "fmla v21.8h, v0.8h, v4.h[0]\n"
+      "fmla v25.8h, v0.8h, v3.h[0]\n"
+      "fmla v29.8h, v0.8h, v2.h[0]\n"
+      "ldr q0, [x17, #0x30]\n"
+      "fmla v10.8h, v1.8h, v7.h[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "fmla v30.8h, v6.8h, v5.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "fmla v31.8h, v7.8h, v5.h[0]\n"
+      "fmla v14.8h, v1.8h, v6.h[0]\n"
+      "fmla v18.8h, v1.8h, v5.h[0]\n"
+      "fmla v22.8h, v1.8h, v4.h[0]\n"
+      "fmla v26.8h, v1.8h, v3.h[0]\n"
+      "fmla v30.8h, v1.8h, v2.h[0]\n"
+      "fmla v11.8h, v0.8h, v7.h[0]\n"
+      "fmla v15.8h, v0.8h, v6.h[0]\n"
+      "fmla v19.8h, v0.8h, v5.h[0]\n"
+      "fmla v23.8h, v0.8h, v4.h[0]\n"
+      "fmla v27.8h, v0.8h, v3.h[0]\n"
+      "fmla v31.8h, v0.8h, v2.h[0]\n"
       "cbnz x14, 274b\n"
       "275:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -5743,7 +5743,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "296:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
index 3353087..8e5f600 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -244,11 +244,11 @@
       "23:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 24f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 25f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -265,69 +265,69 @@
       "blt 27f\n"
       "26:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x10, #0x110]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x10, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x10, #0x130]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x10, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x10, #0x150]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x10, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x10, #0x170]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x10, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x10, #0x190]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x10, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x10, #0x1b0]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x10, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x10, #0x1d0]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr q17, [x10, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr q16, [x10, #0x1f0]\n"
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
       "cmp x27, #0x10\n"
       "add x10, x10, #0x200\n"
@@ -337,84 +337,84 @@
       "bge 26b\n"
       "27:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "ldr q17, [x10, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "ldr q16, [x10, #0x110]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x10, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x10, #0x130]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "ldr q17, [x10, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "ldr q16, [x10, #0x150]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x10, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x10, #0x170]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "ldr q17, [x10, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "ldr q16, [x10, #0x190]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x10, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x10, #0x1b0]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "ldr q17, [x10, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "ldr q16, [x10, #0x1d0]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "ldr q17, [x10, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "ldr q16, [x10, #0x1f0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x8\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x200\n"
       "28:"  // Height 1: Multiply loop: Main loop skip
       "cbz x27, 30f\n"
       "29:"  // Height 1: Multiply loop: Odd block loop
       "ldr h0, [x26], #0x2\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x0]\n"
+      "fmla v8.8h, v16.8h, v0.h[0]\n"
       "sub x27, x27, #0x1\n"
-      "ldr q7, [x10, #0x10]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "ldr q16, [x10, #0x20]\n"
+      "fmla v9.8h, v17.8h, v0.h[0]\n"
+      "fmla v10.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
       "add x10, x10, #0x40\n"
       "cbnz x27, 29b\n"
       "30:"  // Height 1: Multiply loop: No odd multiplies
@@ -425,17 +425,17 @@
       "prfm pstl1keep, [x9, #0x0]\n"
       "tbz %x[flags], #1, 31f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v17.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v17.8h\n"
+      "fmin v9.8h, v9.8h, v17.8h\n"
+      "fmin v10.8h, v10.8h, v17.8h\n"
+      "fmin v11.8h, v11.8h, v17.8h\n"
+      "fmax v8.8h, v8.8h, v16.8h\n"
+      "fmax v9.8h, v9.8h, v16.8h\n"
+      "fmax v10.8h, v10.8h, v16.8h\n"
+      "fmax v11.8h, v11.8h, v16.8h\n"
       "31:"  // Height 1: No activation
       "cmp x11, #0x20\n"
       "bge 48f\n"
@@ -733,12 +733,12 @@
       "72:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 73f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 74f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -746,7 +746,7 @@
       "b 74f\n"
       "73:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "74:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "blt 77f\n"
@@ -759,230 +759,230 @@
       "75:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "sub x27, x27, #0x8\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x26, x26, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "fmla v14.8h, v17.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "add x25, x25, #0x10\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "fmla v15.8h, v16.8h, v1.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "cmp x27, #0x10\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "fmla v12.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "fmla v13.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "fmla v14.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "fmla v15.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "fmla v12.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "fmla v13.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "fmla v14.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "fmla v15.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "fmla v12.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "fmla v13.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "fmla v14.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "fmla v15.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x10, #0x110]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "fmla v12.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x10, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "fmla v13.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x10, #0x130]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "fmla v14.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x10, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "fmla v15.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x10, #0x150]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "fmla v12.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x10, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "fmla v13.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x10, #0x170]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "fmla v14.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x10, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "fmla v15.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x10, #0x190]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "fmla v12.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x10, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "fmla v13.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x10, #0x1b0]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "fmla v14.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x10, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "fmla v15.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x10, #0x1d0]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "fmla v12.8h, v17.8h, v1.h[7]\n"
+      "ldr q17, [x10, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "fmla v13.8h, v16.8h, v1.h[7]\n"
+      "ldr q16, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v14.8h, v17.8h, v1.h[7]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v16.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 75b\n"
       "76:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.8h, v6.8h, v0.h[0]\n"
       "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "add x26, x26, #0x10\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x25, x25, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v10.8h, v17.8h, v0.h[0]\n"
+      "fmla v14.8h, v17.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "sub x27, x27, #0x8\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      "fmla v11.8h, v16.8h, v0.h[0]\n"
+      "fmla v15.8h, v16.8h, v1.h[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      "fmla v8.8h, v17.8h, v0.h[1]\n"
+      "fmla v12.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v9.8h, v16.8h, v0.h[1]\n"
+      "fmla v13.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      "fmla v10.8h, v17.8h, v0.h[1]\n"
+      "fmla v14.8h, v17.8h, v1.h[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.8h, v16.8h, v0.h[1]\n"
+      "fmla v15.8h, v16.8h, v1.h[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.8h, v17.8h, v0.h[2]\n"
+      "fmla v12.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.8h, v16.8h, v0.h[2]\n"
+      "fmla v13.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.8h, v17.8h, v0.h[2]\n"
+      "fmla v14.8h, v17.8h, v1.h[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.8h, v16.8h, v0.h[2]\n"
+      "fmla v15.8h, v16.8h, v1.h[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.8h, v17.8h, v0.h[3]\n"
+      "fmla v12.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.8h, v16.8h, v0.h[3]\n"
+      "fmla v13.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
+      "fmla v10.8h, v17.8h, v0.h[3]\n"
+      "fmla v14.8h, v17.8h, v1.h[3]\n"
+      "ldr q17, [x10, #0x100]\n"
+      "fmla v11.8h, v16.8h, v0.h[3]\n"
+      "fmla v15.8h, v16.8h, v1.h[3]\n"
+      "ldr q16, [x10, #0x110]\n"
+      "fmla v8.8h, v17.8h, v0.h[4]\n"
+      "fmla v12.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x10, #0x120]\n"
+      "fmla v9.8h, v16.8h, v0.h[4]\n"
+      "fmla v13.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x10, #0x130]\n"
+      "fmla v10.8h, v17.8h, v0.h[4]\n"
+      "fmla v14.8h, v17.8h, v1.h[4]\n"
+      "ldr q17, [x10, #0x140]\n"
+      "fmla v11.8h, v16.8h, v0.h[4]\n"
+      "fmla v15.8h, v16.8h, v1.h[4]\n"
+      "ldr q16, [x10, #0x150]\n"
+      "fmla v8.8h, v17.8h, v0.h[5]\n"
+      "fmla v12.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x10, #0x160]\n"
+      "fmla v9.8h, v16.8h, v0.h[5]\n"
+      "fmla v13.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x10, #0x170]\n"
+      "fmla v10.8h, v17.8h, v0.h[5]\n"
+      "fmla v14.8h, v17.8h, v1.h[5]\n"
+      "ldr q17, [x10, #0x180]\n"
+      "fmla v11.8h, v16.8h, v0.h[5]\n"
+      "fmla v15.8h, v16.8h, v1.h[5]\n"
+      "ldr q16, [x10, #0x190]\n"
+      "fmla v8.8h, v17.8h, v0.h[6]\n"
+      "fmla v12.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x10, #0x1a0]\n"
+      "fmla v9.8h, v16.8h, v0.h[6]\n"
+      "fmla v13.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x10, #0x1b0]\n"
+      "fmla v10.8h, v17.8h, v0.h[6]\n"
+      "fmla v14.8h, v17.8h, v1.h[6]\n"
+      "ldr q17, [x10, #0x1c0]\n"
+      "fmla v11.8h, v16.8h, v0.h[6]\n"
+      "fmla v15.8h, v16.8h, v1.h[6]\n"
+      "ldr q16, [x10, #0x1d0]\n"
+      "fmla v8.8h, v17.8h, v0.h[7]\n"
+      "fmla v12.8h, v17.8h, v1.h[7]\n"
+      "ldr q17, [x10, #0x1e0]\n"
+      "fmla v9.8h, v16.8h, v0.h[7]\n"
+      "fmla v13.8h, v16.8h, v1.h[7]\n"
+      "ldr q16, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v10.8h, v17.8h, v0.h[7]\n"
+      "fmla v14.8h, v17.8h, v1.h[7]\n"
+      "fmla v11.8h, v16.8h, v0.h[7]\n"
+      "fmla v15.8h, v16.8h, v1.h[7]\n"
       "77:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 79f\n"
       "78:"  // Height 2: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h1, [x26], #0x2\n"
+      "ldr h0, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      "fmla v8.8h, v17.8h, v1.h[0]\n"
+      "fmla v12.8h, v17.8h, v0.h[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.8h, v16.8h, v1.h[0]\n"
+      "fmla v13.8h, v16.8h, v0.h[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v10.8h, v17.8h, v1.h[0]\n"
+      "fmla v14.8h, v17.8h, v0.h[0]\n"
       "add x10, x10, #0x40\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v11.8h, v16.8h, v1.h[0]\n"
+      "fmla v15.8h, v16.8h, v0.h[0]\n"
       "cbnz x27, 78b\n"
       "79:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -995,25 +995,25 @@
       "prfm pstl1keep, [x25, #0x0]\n"
       "tbz %x[flags], #1, 80f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v17.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
+      "ld1r { v16.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v17.8h\n"
+      "fmin v9.8h, v9.8h, v17.8h\n"
+      "fmin v10.8h, v10.8h, v17.8h\n"
+      "fmin v11.8h, v11.8h, v17.8h\n"
+      "fmin v12.8h, v12.8h, v17.8h\n"
+      "fmin v13.8h, v13.8h, v17.8h\n"
+      "fmin v14.8h, v14.8h, v17.8h\n"
+      "fmin v15.8h, v15.8h, v17.8h\n"
+      "fmax v8.8h, v8.8h, v16.8h\n"
+      "fmax v9.8h, v9.8h, v16.8h\n"
+      "fmax v10.8h, v10.8h, v16.8h\n"
+      "fmax v11.8h, v11.8h, v16.8h\n"
+      "fmax v12.8h, v12.8h, v16.8h\n"
+      "fmax v13.8h, v13.8h, v16.8h\n"
+      "fmax v14.8h, v14.8h, v16.8h\n"
+      "fmax v15.8h, v15.8h, v16.8h\n"
       "80:"  // Height 2: No activation
       "cmp x11, #0x20\n"
       "bge 97f\n"
@@ -1392,13 +1392,13 @@
       "121:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 122f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 123f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1407,8 +1407,8 @@
       "b 123f\n"
       "122:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "123:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "blt 126f\n"
@@ -1425,139 +1425,139 @@
       "sub x27, x27, #0x8\n"
       "add x26, x26, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x25, x25, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x24, x24, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v21.8h, v0.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
       "cmp x27, #0x10\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v18.8h, v21.8h, v2.h[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      "fmla v11.8h, v20.8h, v0.h[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v2.h[0]\n"
+      "ldr q20, [x10, #0x50]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v8.8h, v21.8h, v0.h[1]\n"
+      "fmla v12.8h, v21.8h, v1.h[1]\n"
+      "fmla v16.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      "fmla v9.8h, v20.8h, v0.h[1]\n"
+      "fmla v13.8h, v20.8h, v1.h[1]\n"
+      "fmla v17.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      "fmla v10.8h, v21.8h, v0.h[1]\n"
+      "fmla v14.8h, v21.8h, v1.h[1]\n"
+      "fmla v18.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      "fmla v11.8h, v20.8h, v0.h[1]\n"
+      "fmla v15.8h, v20.8h, v1.h[1]\n"
+      "fmla v19.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      "fmla v8.8h, v21.8h, v0.h[2]\n"
+      "fmla v12.8h, v21.8h, v1.h[2]\n"
+      "fmla v16.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      "fmla v9.8h, v20.8h, v0.h[2]\n"
+      "fmla v13.8h, v20.8h, v1.h[2]\n"
+      "fmla v17.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      "fmla v10.8h, v21.8h, v0.h[2]\n"
+      "fmla v14.8h, v21.8h, v1.h[2]\n"
+      "fmla v18.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      "fmla v11.8h, v20.8h, v0.h[2]\n"
+      "fmla v15.8h, v20.8h, v1.h[2]\n"
+      "fmla v19.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      "fmla v8.8h, v21.8h, v0.h[3]\n"
+      "fmla v12.8h, v21.8h, v1.h[3]\n"
+      "fmla v16.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      "fmla v9.8h, v20.8h, v0.h[3]\n"
+      "fmla v13.8h, v20.8h, v1.h[3]\n"
+      "fmla v17.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
+      "fmla v10.8h, v21.8h, v0.h[3]\n"
+      "fmla v14.8h, v21.8h, v1.h[3]\n"
+      "fmla v18.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0x100]\n"
+      "fmla v11.8h, v20.8h, v0.h[3]\n"
+      "fmla v15.8h, v20.8h, v1.h[3]\n"
+      "fmla v19.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x10, #0x110]\n"
+      "fmla v8.8h, v21.8h, v0.h[4]\n"
+      "fmla v12.8h, v21.8h, v1.h[4]\n"
+      "fmla v16.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x10, #0x120]\n"
+      "fmla v9.8h, v20.8h, v0.h[4]\n"
+      "fmla v13.8h, v20.8h, v1.h[4]\n"
+      "fmla v17.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x10, #0x130]\n"
+      "fmla v10.8h, v21.8h, v0.h[4]\n"
+      "fmla v14.8h, v21.8h, v1.h[4]\n"
+      "fmla v18.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x10, #0x140]\n"
+      "fmla v11.8h, v20.8h, v0.h[4]\n"
+      "fmla v15.8h, v20.8h, v1.h[4]\n"
+      "fmla v19.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x10, #0x150]\n"
+      "fmla v8.8h, v21.8h, v0.h[5]\n"
+      "fmla v12.8h, v21.8h, v1.h[5]\n"
+      "fmla v16.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x10, #0x160]\n"
+      "fmla v9.8h, v20.8h, v0.h[5]\n"
+      "fmla v13.8h, v20.8h, v1.h[5]\n"
+      "fmla v17.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x10, #0x170]\n"
+      "fmla v10.8h, v21.8h, v0.h[5]\n"
+      "fmla v14.8h, v21.8h, v1.h[5]\n"
+      "fmla v18.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x10, #0x180]\n"
+      "fmla v11.8h, v20.8h, v0.h[5]\n"
+      "fmla v15.8h, v20.8h, v1.h[5]\n"
+      "fmla v19.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x10, #0x190]\n"
+      "fmla v8.8h, v21.8h, v0.h[6]\n"
+      "fmla v12.8h, v21.8h, v1.h[6]\n"
+      "fmla v16.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x10, #0x1a0]\n"
+      "fmla v9.8h, v20.8h, v0.h[6]\n"
+      "fmla v13.8h, v20.8h, v1.h[6]\n"
+      "fmla v17.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x10, #0x1b0]\n"
+      "fmla v10.8h, v21.8h, v0.h[6]\n"
+      "fmla v14.8h, v21.8h, v1.h[6]\n"
+      "fmla v18.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x10, #0x1c0]\n"
+      "fmla v11.8h, v20.8h, v0.h[6]\n"
+      "fmla v15.8h, v20.8h, v1.h[6]\n"
+      "fmla v19.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x10, #0x1d0]\n"
+      "fmla v8.8h, v21.8h, v0.h[7]\n"
+      "fmla v12.8h, v21.8h, v1.h[7]\n"
+      "fmla v16.8h, v21.8h, v2.h[7]\n"
+      "ldr q21, [x10, #0x1e0]\n"
+      "fmla v9.8h, v20.8h, v0.h[7]\n"
+      "fmla v13.8h, v20.8h, v1.h[7]\n"
+      "fmla v17.8h, v20.8h, v2.h[7]\n"
+      "ldr q20, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v10.8h, v21.8h, v0.h[7]\n"
+      "fmla v14.8h, v21.8h, v1.h[7]\n"
+      "fmla v18.8h, v21.8h, v2.h[7]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v20.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v20.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v19.8h, v20.8h, v2.h[7]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 124b\n"
@@ -1567,159 +1567,159 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x24, x24, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "sub x27, x27, #0x8\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v21.8h, v0.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v18.8h, v21.8h, v2.h[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      "fmla v11.8h, v20.8h, v0.h[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v2.h[0]\n"
+      "ldr q20, [x10, #0x50]\n"
+      "fmla v8.8h, v21.8h, v0.h[1]\n"
+      "fmla v12.8h, v21.8h, v1.h[1]\n"
+      "fmla v16.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      "fmla v9.8h, v20.8h, v0.h[1]\n"
+      "fmla v13.8h, v20.8h, v1.h[1]\n"
+      "fmla v17.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      "fmla v10.8h, v21.8h, v0.h[1]\n"
+      "fmla v14.8h, v21.8h, v1.h[1]\n"
+      "fmla v18.8h, v21.8h, v2.h[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      "fmla v11.8h, v20.8h, v0.h[1]\n"
+      "fmla v15.8h, v20.8h, v1.h[1]\n"
+      "fmla v19.8h, v20.8h, v2.h[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      "fmla v8.8h, v21.8h, v0.h[2]\n"
+      "fmla v12.8h, v21.8h, v1.h[2]\n"
+      "fmla v16.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      "fmla v9.8h, v20.8h, v0.h[2]\n"
+      "fmla v13.8h, v20.8h, v1.h[2]\n"
+      "fmla v17.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      "fmla v10.8h, v21.8h, v0.h[2]\n"
+      "fmla v14.8h, v21.8h, v1.h[2]\n"
+      "fmla v18.8h, v21.8h, v2.h[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      "fmla v11.8h, v20.8h, v0.h[2]\n"
+      "fmla v15.8h, v20.8h, v1.h[2]\n"
+      "fmla v19.8h, v20.8h, v2.h[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      "fmla v8.8h, v21.8h, v0.h[3]\n"
+      "fmla v12.8h, v21.8h, v1.h[3]\n"
+      "fmla v16.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      "fmla v9.8h, v20.8h, v0.h[3]\n"
+      "fmla v13.8h, v20.8h, v1.h[3]\n"
+      "fmla v17.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
+      "fmla v10.8h, v21.8h, v0.h[3]\n"
+      "fmla v14.8h, v21.8h, v1.h[3]\n"
+      "fmla v18.8h, v21.8h, v2.h[3]\n"
+      "ldr q21, [x10, #0x100]\n"
+      "fmla v11.8h, v20.8h, v0.h[3]\n"
+      "fmla v15.8h, v20.8h, v1.h[3]\n"
+      "fmla v19.8h, v20.8h, v2.h[3]\n"
+      "ldr q20, [x10, #0x110]\n"
+      "fmla v8.8h, v21.8h, v0.h[4]\n"
+      "fmla v12.8h, v21.8h, v1.h[4]\n"
+      "fmla v16.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x10, #0x120]\n"
+      "fmla v9.8h, v20.8h, v0.h[4]\n"
+      "fmla v13.8h, v20.8h, v1.h[4]\n"
+      "fmla v17.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x10, #0x130]\n"
+      "fmla v10.8h, v21.8h, v0.h[4]\n"
+      "fmla v14.8h, v21.8h, v1.h[4]\n"
+      "fmla v18.8h, v21.8h, v2.h[4]\n"
+      "ldr q21, [x10, #0x140]\n"
+      "fmla v11.8h, v20.8h, v0.h[4]\n"
+      "fmla v15.8h, v20.8h, v1.h[4]\n"
+      "fmla v19.8h, v20.8h, v2.h[4]\n"
+      "ldr q20, [x10, #0x150]\n"
+      "fmla v8.8h, v21.8h, v0.h[5]\n"
+      "fmla v12.8h, v21.8h, v1.h[5]\n"
+      "fmla v16.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x10, #0x160]\n"
+      "fmla v9.8h, v20.8h, v0.h[5]\n"
+      "fmla v13.8h, v20.8h, v1.h[5]\n"
+      "fmla v17.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x10, #0x170]\n"
+      "fmla v10.8h, v21.8h, v0.h[5]\n"
+      "fmla v14.8h, v21.8h, v1.h[5]\n"
+      "fmla v18.8h, v21.8h, v2.h[5]\n"
+      "ldr q21, [x10, #0x180]\n"
+      "fmla v11.8h, v20.8h, v0.h[5]\n"
+      "fmla v15.8h, v20.8h, v1.h[5]\n"
+      "fmla v19.8h, v20.8h, v2.h[5]\n"
+      "ldr q20, [x10, #0x190]\n"
+      "fmla v8.8h, v21.8h, v0.h[6]\n"
+      "fmla v12.8h, v21.8h, v1.h[6]\n"
+      "fmla v16.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x10, #0x1a0]\n"
+      "fmla v9.8h, v20.8h, v0.h[6]\n"
+      "fmla v13.8h, v20.8h, v1.h[6]\n"
+      "fmla v17.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x10, #0x1b0]\n"
+      "fmla v10.8h, v21.8h, v0.h[6]\n"
+      "fmla v14.8h, v21.8h, v1.h[6]\n"
+      "fmla v18.8h, v21.8h, v2.h[6]\n"
+      "ldr q21, [x10, #0x1c0]\n"
+      "fmla v11.8h, v20.8h, v0.h[6]\n"
+      "fmla v15.8h, v20.8h, v1.h[6]\n"
+      "fmla v19.8h, v20.8h, v2.h[6]\n"
+      "ldr q20, [x10, #0x1d0]\n"
+      "fmla v8.8h, v21.8h, v0.h[7]\n"
+      "fmla v12.8h, v21.8h, v1.h[7]\n"
+      "fmla v16.8h, v21.8h, v2.h[7]\n"
+      "ldr q21, [x10, #0x1e0]\n"
+      "fmla v9.8h, v20.8h, v0.h[7]\n"
+      "fmla v13.8h, v20.8h, v1.h[7]\n"
+      "fmla v17.8h, v20.8h, v2.h[7]\n"
+      "ldr q20, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v10.8h, v21.8h, v0.h[7]\n"
+      "fmla v14.8h, v21.8h, v1.h[7]\n"
+      "fmla v18.8h, v21.8h, v2.h[7]\n"
+      "fmla v11.8h, v20.8h, v0.h[7]\n"
+      "fmla v15.8h, v20.8h, v1.h[7]\n"
+      "fmla v19.8h, v20.8h, v2.h[7]\n"
       "126:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 128f\n"
       "127:"  // Height 3: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
       "ldr h1, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr h2, [x24], #0x2\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr h0, [x24], #0x2\n"
+      "ldr q21, [x10, #0x0]\n"
+      "fmla v8.8h, v21.8h, v2.h[0]\n"
+      "fmla v12.8h, v21.8h, v1.h[0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      "fmla v16.8h, v21.8h, v0.h[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "fmla v9.8h, v20.8h, v2.h[0]\n"
+      "fmla v13.8h, v20.8h, v1.h[0]\n"
+      "fmla v17.8h, v20.8h, v0.h[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v10.8h, v21.8h, v2.h[0]\n"
+      "fmla v14.8h, v21.8h, v1.h[0]\n"
+      "fmla v18.8h, v21.8h, v0.h[0]\n"
+      "fmla v11.8h, v20.8h, v2.h[0]\n"
+      "fmla v15.8h, v20.8h, v1.h[0]\n"
+      "fmla v19.8h, v20.8h, v0.h[0]\n"
       "cbnz x27, 127b\n"
       "128:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1734,33 +1734,33 @@
       "prfm pstl1keep, [x24, #0x0]\n"
       "tbz %x[flags], #1, 129f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v21.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmin v16.8h, v16.8h, v1.8h\n"
-      "fmin v17.8h, v17.8h, v1.8h\n"
-      "fmin v18.8h, v18.8h, v1.8h\n"
-      "fmin v19.8h, v19.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
+      "ld1r { v20.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v21.8h\n"
+      "fmin v9.8h, v9.8h, v21.8h\n"
+      "fmin v10.8h, v10.8h, v21.8h\n"
+      "fmin v11.8h, v11.8h, v21.8h\n"
+      "fmin v12.8h, v12.8h, v21.8h\n"
+      "fmin v13.8h, v13.8h, v21.8h\n"
+      "fmin v14.8h, v14.8h, v21.8h\n"
+      "fmin v15.8h, v15.8h, v21.8h\n"
+      "fmin v16.8h, v16.8h, v21.8h\n"
+      "fmin v17.8h, v17.8h, v21.8h\n"
+      "fmin v18.8h, v18.8h, v21.8h\n"
+      "fmin v19.8h, v19.8h, v21.8h\n"
+      "fmax v8.8h, v8.8h, v20.8h\n"
+      "fmax v9.8h, v9.8h, v20.8h\n"
+      "fmax v10.8h, v10.8h, v20.8h\n"
+      "fmax v11.8h, v11.8h, v20.8h\n"
+      "fmax v12.8h, v12.8h, v20.8h\n"
+      "fmax v13.8h, v13.8h, v20.8h\n"
+      "fmax v14.8h, v14.8h, v20.8h\n"
+      "fmax v15.8h, v15.8h, v20.8h\n"
+      "fmax v16.8h, v16.8h, v20.8h\n"
+      "fmax v17.8h, v17.8h, v20.8h\n"
+      "fmax v18.8h, v18.8h, v20.8h\n"
+      "fmax v19.8h, v19.8h, v20.8h\n"
       "129:"  // Height 3: No activation
       "cmp x11, #0x20\n"
       "bge 146f\n"
@@ -2220,14 +2220,14 @@
       "170:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 171f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 172f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2237,9 +2237,9 @@
       "b 172f\n"
       "171:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "172:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "blt 175f\n"
@@ -2258,7 +2258,7 @@
       "add x26, x26, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x25, x25, #0x10\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2266,165 +2266,165 @@
       "add x23, x23, #0x10\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "cmp x27, #0x10\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v25.8h, v0.h[0]\n"
+      "fmla v14.8h, v25.8h, v1.h[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v18.8h, v25.8h, v2.h[0]\n"
+      "fmla v22.8h, v25.8h, v3.h[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v11.8h, v24.8h, v0.h[0]\n"
+      "fmla v15.8h, v24.8h, v1.h[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v19.8h, v24.8h, v2.h[0]\n"
+      "fmla v23.8h, v24.8h, v3.h[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      "fmla v8.8h, v25.8h, v0.h[1]\n"
+      "fmla v12.8h, v25.8h, v1.h[1]\n"
+      "fmla v16.8h, v25.8h, v2.h[1]\n"
+      "fmla v20.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      "fmla v9.8h, v24.8h, v0.h[1]\n"
+      "fmla v13.8h, v24.8h, v1.h[1]\n"
+      "fmla v17.8h, v24.8h, v2.h[1]\n"
+      "fmla v21.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      "fmla v10.8h, v25.8h, v0.h[1]\n"
+      "fmla v14.8h, v25.8h, v1.h[1]\n"
+      "fmla v18.8h, v25.8h, v2.h[1]\n"
+      "fmla v22.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      "fmla v11.8h, v24.8h, v0.h[1]\n"
+      "fmla v15.8h, v24.8h, v1.h[1]\n"
+      "fmla v19.8h, v24.8h, v2.h[1]\n"
+      "fmla v23.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      "fmla v8.8h, v25.8h, v0.h[2]\n"
+      "fmla v12.8h, v25.8h, v1.h[2]\n"
+      "fmla v16.8h, v25.8h, v2.h[2]\n"
+      "fmla v20.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      "fmla v9.8h, v24.8h, v0.h[2]\n"
+      "fmla v13.8h, v24.8h, v1.h[2]\n"
+      "fmla v17.8h, v24.8h, v2.h[2]\n"
+      "fmla v21.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      "fmla v10.8h, v25.8h, v0.h[2]\n"
+      "fmla v14.8h, v25.8h, v1.h[2]\n"
+      "fmla v18.8h, v25.8h, v2.h[2]\n"
+      "fmla v22.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      "fmla v11.8h, v24.8h, v0.h[2]\n"
+      "fmla v15.8h, v24.8h, v1.h[2]\n"
+      "fmla v19.8h, v24.8h, v2.h[2]\n"
+      "fmla v23.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      "fmla v8.8h, v25.8h, v0.h[3]\n"
+      "fmla v12.8h, v25.8h, v1.h[3]\n"
+      "fmla v16.8h, v25.8h, v2.h[3]\n"
+      "fmla v20.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      "fmla v9.8h, v24.8h, v0.h[3]\n"
+      "fmla v13.8h, v24.8h, v1.h[3]\n"
+      "fmla v17.8h, v24.8h, v2.h[3]\n"
+      "fmla v21.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
+      "fmla v10.8h, v25.8h, v0.h[3]\n"
+      "fmla v14.8h, v25.8h, v1.h[3]\n"
+      "fmla v18.8h, v25.8h, v2.h[3]\n"
+      "fmla v22.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0x100]\n"
+      "fmla v11.8h, v24.8h, v0.h[3]\n"
+      "fmla v15.8h, v24.8h, v1.h[3]\n"
+      "fmla v19.8h, v24.8h, v2.h[3]\n"
+      "fmla v23.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x10, #0x110]\n"
+      "fmla v8.8h, v25.8h, v0.h[4]\n"
+      "fmla v12.8h, v25.8h, v1.h[4]\n"
+      "fmla v16.8h, v25.8h, v2.h[4]\n"
+      "fmla v20.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x10, #0x120]\n"
+      "fmla v9.8h, v24.8h, v0.h[4]\n"
+      "fmla v13.8h, v24.8h, v1.h[4]\n"
+      "fmla v17.8h, v24.8h, v2.h[4]\n"
+      "fmla v21.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x10, #0x130]\n"
+      "fmla v10.8h, v25.8h, v0.h[4]\n"
+      "fmla v14.8h, v25.8h, v1.h[4]\n"
+      "fmla v18.8h, v25.8h, v2.h[4]\n"
+      "fmla v22.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x10, #0x140]\n"
+      "fmla v11.8h, v24.8h, v0.h[4]\n"
+      "fmla v15.8h, v24.8h, v1.h[4]\n"
+      "fmla v19.8h, v24.8h, v2.h[4]\n"
+      "fmla v23.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x10, #0x150]\n"
+      "fmla v8.8h, v25.8h, v0.h[5]\n"
+      "fmla v12.8h, v25.8h, v1.h[5]\n"
+      "fmla v16.8h, v25.8h, v2.h[5]\n"
+      "fmla v20.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x10, #0x160]\n"
+      "fmla v9.8h, v24.8h, v0.h[5]\n"
+      "fmla v13.8h, v24.8h, v1.h[5]\n"
+      "fmla v17.8h, v24.8h, v2.h[5]\n"
+      "fmla v21.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x10, #0x170]\n"
+      "fmla v10.8h, v25.8h, v0.h[5]\n"
+      "fmla v14.8h, v25.8h, v1.h[5]\n"
+      "fmla v18.8h, v25.8h, v2.h[5]\n"
+      "fmla v22.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x10, #0x180]\n"
+      "fmla v11.8h, v24.8h, v0.h[5]\n"
+      "fmla v15.8h, v24.8h, v1.h[5]\n"
+      "fmla v19.8h, v24.8h, v2.h[5]\n"
+      "fmla v23.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x10, #0x190]\n"
+      "fmla v8.8h, v25.8h, v0.h[6]\n"
+      "fmla v12.8h, v25.8h, v1.h[6]\n"
+      "fmla v16.8h, v25.8h, v2.h[6]\n"
+      "fmla v20.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x10, #0x1a0]\n"
+      "fmla v9.8h, v24.8h, v0.h[6]\n"
+      "fmla v13.8h, v24.8h, v1.h[6]\n"
+      "fmla v17.8h, v24.8h, v2.h[6]\n"
+      "fmla v21.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x10, #0x1b0]\n"
+      "fmla v10.8h, v25.8h, v0.h[6]\n"
+      "fmla v14.8h, v25.8h, v1.h[6]\n"
+      "fmla v18.8h, v25.8h, v2.h[6]\n"
+      "fmla v22.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x10, #0x1c0]\n"
+      "fmla v11.8h, v24.8h, v0.h[6]\n"
+      "fmla v15.8h, v24.8h, v1.h[6]\n"
+      "fmla v19.8h, v24.8h, v2.h[6]\n"
+      "fmla v23.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x10, #0x1d0]\n"
+      "fmla v8.8h, v25.8h, v0.h[7]\n"
+      "fmla v12.8h, v25.8h, v1.h[7]\n"
+      "fmla v16.8h, v25.8h, v2.h[7]\n"
+      "fmla v20.8h, v25.8h, v3.h[7]\n"
+      "ldr q25, [x10, #0x1e0]\n"
+      "fmla v9.8h, v24.8h, v0.h[7]\n"
+      "fmla v13.8h, v24.8h, v1.h[7]\n"
+      "fmla v17.8h, v24.8h, v2.h[7]\n"
+      "fmla v21.8h, v24.8h, v3.h[7]\n"
+      "ldr q24, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v10.8h, v25.8h, v0.h[7]\n"
+      "fmla v14.8h, v25.8h, v1.h[7]\n"
+      "fmla v18.8h, v25.8h, v2.h[7]\n"
+      "fmla v22.8h, v25.8h, v3.h[7]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v24.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v24.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v19.8h, v24.8h, v2.h[7]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v23.8h, v24.8h, v3.h[7]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 173b\n"
@@ -2435,7 +2435,7 @@
       "add x25, x25, #0x10\n"
       "fmla v16.8h, v6.8h, v2.h[0]\n"
       "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x24, x24, #0x10\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -2443,189 +2443,189 @@
       "sub x27, x27, #0x8\n"
       "fmla v17.8h, v7.8h, v2.h[0]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v25.8h, v0.h[0]\n"
+      "fmla v14.8h, v25.8h, v1.h[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v18.8h, v25.8h, v2.h[0]\n"
+      "fmla v22.8h, v25.8h, v3.h[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v11.8h, v24.8h, v0.h[0]\n"
+      "fmla v15.8h, v24.8h, v1.h[0]\n"
+      "fmla v19.8h, v24.8h, v2.h[0]\n"
+      "fmla v23.8h, v24.8h, v3.h[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      "fmla v8.8h, v25.8h, v0.h[1]\n"
+      "fmla v12.8h, v25.8h, v1.h[1]\n"
+      "fmla v16.8h, v25.8h, v2.h[1]\n"
+      "fmla v20.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      "fmla v9.8h, v24.8h, v0.h[1]\n"
+      "fmla v13.8h, v24.8h, v1.h[1]\n"
+      "fmla v17.8h, v24.8h, v2.h[1]\n"
+      "fmla v21.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      "fmla v10.8h, v25.8h, v0.h[1]\n"
+      "fmla v14.8h, v25.8h, v1.h[1]\n"
+      "fmla v18.8h, v25.8h, v2.h[1]\n"
+      "fmla v22.8h, v25.8h, v3.h[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      "fmla v11.8h, v24.8h, v0.h[1]\n"
+      "fmla v15.8h, v24.8h, v1.h[1]\n"
+      "fmla v19.8h, v24.8h, v2.h[1]\n"
+      "fmla v23.8h, v24.8h, v3.h[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      "fmla v8.8h, v25.8h, v0.h[2]\n"
+      "fmla v12.8h, v25.8h, v1.h[2]\n"
+      "fmla v16.8h, v25.8h, v2.h[2]\n"
+      "fmla v20.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      "fmla v9.8h, v24.8h, v0.h[2]\n"
+      "fmla v13.8h, v24.8h, v1.h[2]\n"
+      "fmla v17.8h, v24.8h, v2.h[2]\n"
+      "fmla v21.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      "fmla v10.8h, v25.8h, v0.h[2]\n"
+      "fmla v14.8h, v25.8h, v1.h[2]\n"
+      "fmla v18.8h, v25.8h, v2.h[2]\n"
+      "fmla v22.8h, v25.8h, v3.h[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      "fmla v11.8h, v24.8h, v0.h[2]\n"
+      "fmla v15.8h, v24.8h, v1.h[2]\n"
+      "fmla v19.8h, v24.8h, v2.h[2]\n"
+      "fmla v23.8h, v24.8h, v3.h[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      "fmla v8.8h, v25.8h, v0.h[3]\n"
+      "fmla v12.8h, v25.8h, v1.h[3]\n"
+      "fmla v16.8h, v25.8h, v2.h[3]\n"
+      "fmla v20.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      "fmla v9.8h, v24.8h, v0.h[3]\n"
+      "fmla v13.8h, v24.8h, v1.h[3]\n"
+      "fmla v17.8h, v24.8h, v2.h[3]\n"
+      "fmla v21.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
+      "fmla v10.8h, v25.8h, v0.h[3]\n"
+      "fmla v14.8h, v25.8h, v1.h[3]\n"
+      "fmla v18.8h, v25.8h, v2.h[3]\n"
+      "fmla v22.8h, v25.8h, v3.h[3]\n"
+      "ldr q25, [x10, #0x100]\n"
+      "fmla v11.8h, v24.8h, v0.h[3]\n"
+      "fmla v15.8h, v24.8h, v1.h[3]\n"
+      "fmla v19.8h, v24.8h, v2.h[3]\n"
+      "fmla v23.8h, v24.8h, v3.h[3]\n"
+      "ldr q24, [x10, #0x110]\n"
+      "fmla v8.8h, v25.8h, v0.h[4]\n"
+      "fmla v12.8h, v25.8h, v1.h[4]\n"
+      "fmla v16.8h, v25.8h, v2.h[4]\n"
+      "fmla v20.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x10, #0x120]\n"
+      "fmla v9.8h, v24.8h, v0.h[4]\n"
+      "fmla v13.8h, v24.8h, v1.h[4]\n"
+      "fmla v17.8h, v24.8h, v2.h[4]\n"
+      "fmla v21.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x10, #0x130]\n"
+      "fmla v10.8h, v25.8h, v0.h[4]\n"
+      "fmla v14.8h, v25.8h, v1.h[4]\n"
+      "fmla v18.8h, v25.8h, v2.h[4]\n"
+      "fmla v22.8h, v25.8h, v3.h[4]\n"
+      "ldr q25, [x10, #0x140]\n"
+      "fmla v11.8h, v24.8h, v0.h[4]\n"
+      "fmla v15.8h, v24.8h, v1.h[4]\n"
+      "fmla v19.8h, v24.8h, v2.h[4]\n"
+      "fmla v23.8h, v24.8h, v3.h[4]\n"
+      "ldr q24, [x10, #0x150]\n"
+      "fmla v8.8h, v25.8h, v0.h[5]\n"
+      "fmla v12.8h, v25.8h, v1.h[5]\n"
+      "fmla v16.8h, v25.8h, v2.h[5]\n"
+      "fmla v20.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x10, #0x160]\n"
+      "fmla v9.8h, v24.8h, v0.h[5]\n"
+      "fmla v13.8h, v24.8h, v1.h[5]\n"
+      "fmla v17.8h, v24.8h, v2.h[5]\n"
+      "fmla v21.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x10, #0x170]\n"
+      "fmla v10.8h, v25.8h, v0.h[5]\n"
+      "fmla v14.8h, v25.8h, v1.h[5]\n"
+      "fmla v18.8h, v25.8h, v2.h[5]\n"
+      "fmla v22.8h, v25.8h, v3.h[5]\n"
+      "ldr q25, [x10, #0x180]\n"
+      "fmla v11.8h, v24.8h, v0.h[5]\n"
+      "fmla v15.8h, v24.8h, v1.h[5]\n"
+      "fmla v19.8h, v24.8h, v2.h[5]\n"
+      "fmla v23.8h, v24.8h, v3.h[5]\n"
+      "ldr q24, [x10, #0x190]\n"
+      "fmla v8.8h, v25.8h, v0.h[6]\n"
+      "fmla v12.8h, v25.8h, v1.h[6]\n"
+      "fmla v16.8h, v25.8h, v2.h[6]\n"
+      "fmla v20.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x10, #0x1a0]\n"
+      "fmla v9.8h, v24.8h, v0.h[6]\n"
+      "fmla v13.8h, v24.8h, v1.h[6]\n"
+      "fmla v17.8h, v24.8h, v2.h[6]\n"
+      "fmla v21.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x10, #0x1b0]\n"
+      "fmla v10.8h, v25.8h, v0.h[6]\n"
+      "fmla v14.8h, v25.8h, v1.h[6]\n"
+      "fmla v18.8h, v25.8h, v2.h[6]\n"
+      "fmla v22.8h, v25.8h, v3.h[6]\n"
+      "ldr q25, [x10, #0x1c0]\n"
+      "fmla v11.8h, v24.8h, v0.h[6]\n"
+      "fmla v15.8h, v24.8h, v1.h[6]\n"
+      "fmla v19.8h, v24.8h, v2.h[6]\n"
+      "fmla v23.8h, v24.8h, v3.h[6]\n"
+      "ldr q24, [x10, #0x1d0]\n"
+      "fmla v8.8h, v25.8h, v0.h[7]\n"
+      "fmla v12.8h, v25.8h, v1.h[7]\n"
+      "fmla v16.8h, v25.8h, v2.h[7]\n"
+      "fmla v20.8h, v25.8h, v3.h[7]\n"
+      "ldr q25, [x10, #0x1e0]\n"
+      "fmla v9.8h, v24.8h, v0.h[7]\n"
+      "fmla v13.8h, v24.8h, v1.h[7]\n"
+      "fmla v17.8h, v24.8h, v2.h[7]\n"
+      "fmla v21.8h, v24.8h, v3.h[7]\n"
+      "ldr q24, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v10.8h, v25.8h, v0.h[7]\n"
+      "fmla v14.8h, v25.8h, v1.h[7]\n"
+      "fmla v18.8h, v25.8h, v2.h[7]\n"
+      "fmla v22.8h, v25.8h, v3.h[7]\n"
+      "fmla v11.8h, v24.8h, v0.h[7]\n"
+      "fmla v15.8h, v24.8h, v1.h[7]\n"
+      "fmla v19.8h, v24.8h, v2.h[7]\n"
+      "fmla v23.8h, v24.8h, v3.h[7]\n"
       "175:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 177f\n"
       "176:"  // Height 4: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h3, [x26], #0x2\n"
+      "ldr h2, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr h2, [x24], #0x2\n"
-      "ldr h3, [x23], #0x2\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr h1, [x24], #0x2\n"
+      "ldr h0, [x23], #0x2\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      "fmla v8.8h, v25.8h, v3.h[0]\n"
+      "fmla v12.8h, v25.8h, v2.h[0]\n"
+      "fmla v16.8h, v25.8h, v1.h[0]\n"
+      "fmla v20.8h, v25.8h, v0.h[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      "fmla v9.8h, v24.8h, v3.h[0]\n"
+      "fmla v13.8h, v24.8h, v2.h[0]\n"
+      "fmla v17.8h, v24.8h, v1.h[0]\n"
+      "fmla v21.8h, v24.8h, v0.h[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v10.8h, v25.8h, v3.h[0]\n"
+      "fmla v14.8h, v25.8h, v2.h[0]\n"
+      "fmla v18.8h, v25.8h, v1.h[0]\n"
+      "fmla v22.8h, v25.8h, v0.h[0]\n"
+      "fmla v11.8h, v24.8h, v3.h[0]\n"
+      "fmla v15.8h, v24.8h, v2.h[0]\n"
+      "fmla v19.8h, v24.8h, v1.h[0]\n"
+      "fmla v23.8h, v24.8h, v0.h[0]\n"
       "cbnz x27, 176b\n"
       "177:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2642,41 +2642,41 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 178f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v25.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmin v16.8h, v16.8h, v1.8h\n"
-      "fmin v17.8h, v17.8h, v1.8h\n"
-      "fmin v18.8h, v18.8h, v1.8h\n"
-      "fmin v19.8h, v19.8h, v1.8h\n"
-      "fmin v20.8h, v20.8h, v1.8h\n"
-      "fmin v21.8h, v21.8h, v1.8h\n"
-      "fmin v22.8h, v22.8h, v1.8h\n"
-      "fmin v23.8h, v23.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
-      "fmax v20.8h, v20.8h, v0.8h\n"
-      "fmax v21.8h, v21.8h, v0.8h\n"
-      "fmax v22.8h, v22.8h, v0.8h\n"
-      "fmax v23.8h, v23.8h, v0.8h\n"
+      "ld1r { v24.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v25.8h\n"
+      "fmin v9.8h, v9.8h, v25.8h\n"
+      "fmin v10.8h, v10.8h, v25.8h\n"
+      "fmin v11.8h, v11.8h, v25.8h\n"
+      "fmin v12.8h, v12.8h, v25.8h\n"
+      "fmin v13.8h, v13.8h, v25.8h\n"
+      "fmin v14.8h, v14.8h, v25.8h\n"
+      "fmin v15.8h, v15.8h, v25.8h\n"
+      "fmin v16.8h, v16.8h, v25.8h\n"
+      "fmin v17.8h, v17.8h, v25.8h\n"
+      "fmin v18.8h, v18.8h, v25.8h\n"
+      "fmin v19.8h, v19.8h, v25.8h\n"
+      "fmin v20.8h, v20.8h, v25.8h\n"
+      "fmin v21.8h, v21.8h, v25.8h\n"
+      "fmin v22.8h, v22.8h, v25.8h\n"
+      "fmin v23.8h, v23.8h, v25.8h\n"
+      "fmax v8.8h, v8.8h, v24.8h\n"
+      "fmax v9.8h, v9.8h, v24.8h\n"
+      "fmax v10.8h, v10.8h, v24.8h\n"
+      "fmax v11.8h, v11.8h, v24.8h\n"
+      "fmax v12.8h, v12.8h, v24.8h\n"
+      "fmax v13.8h, v13.8h, v24.8h\n"
+      "fmax v14.8h, v14.8h, v24.8h\n"
+      "fmax v15.8h, v15.8h, v24.8h\n"
+      "fmax v16.8h, v16.8h, v24.8h\n"
+      "fmax v17.8h, v17.8h, v24.8h\n"
+      "fmax v18.8h, v18.8h, v24.8h\n"
+      "fmax v19.8h, v19.8h, v24.8h\n"
+      "fmax v20.8h, v20.8h, v24.8h\n"
+      "fmax v21.8h, v21.8h, v24.8h\n"
+      "fmax v22.8h, v22.8h, v24.8h\n"
+      "fmax v23.8h, v23.8h, v24.8h\n"
       "178:"  // Height 4: No activation
       "cmp x11, #0x20\n"
       "bge 195f\n"
@@ -3217,15 +3217,15 @@
       "219:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 220f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 221f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -3236,10 +3236,10 @@
       "b 221f\n"
       "220:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "221:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "blt 224f\n"
@@ -3262,7 +3262,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x23, x23, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3271,196 +3271,196 @@
       "cmp x27, #0x10\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v29.8h, v0.h[0]\n"
+      "fmla v14.8h, v29.8h, v1.h[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v3.h[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "fmla v24.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "fmla v26.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "fmla v27.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "fmla v24.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "fmla v25.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "fmla v26.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "fmla v27.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "fmla v24.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "fmla v25.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "fmla v26.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "fmla v27.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "fmla v24.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "fmla v25.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "fmla v26.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "fmla v27.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "fmla v24.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "fmla v25.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "fmla v26.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "fmla v27.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "fmla v24.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "fmla v25.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "fmla v26.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "fmla v27.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "fmla v24.8h, v6.8h, v4.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "fmla v25.8h, v7.8h, v4.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v26.8h, v29.8h, v4.h[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      "fmla v11.8h, v28.8h, v0.h[0]\n"
+      "fmla v15.8h, v28.8h, v1.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v3.h[0]\n"
+      "fmla v27.8h, v28.8h, v4.h[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      "fmla v8.8h, v29.8h, v0.h[1]\n"
+      "fmla v12.8h, v29.8h, v1.h[1]\n"
+      "fmla v16.8h, v29.8h, v2.h[1]\n"
+      "fmla v20.8h, v29.8h, v3.h[1]\n"
+      "fmla v24.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      "fmla v9.8h, v28.8h, v0.h[1]\n"
+      "fmla v13.8h, v28.8h, v1.h[1]\n"
+      "fmla v17.8h, v28.8h, v2.h[1]\n"
+      "fmla v21.8h, v28.8h, v3.h[1]\n"
+      "fmla v25.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      "fmla v10.8h, v29.8h, v0.h[1]\n"
+      "fmla v14.8h, v29.8h, v1.h[1]\n"
+      "fmla v18.8h, v29.8h, v2.h[1]\n"
+      "fmla v22.8h, v29.8h, v3.h[1]\n"
+      "fmla v26.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      "fmla v11.8h, v28.8h, v0.h[1]\n"
+      "fmla v15.8h, v28.8h, v1.h[1]\n"
+      "fmla v19.8h, v28.8h, v2.h[1]\n"
+      "fmla v23.8h, v28.8h, v3.h[1]\n"
+      "fmla v27.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      "fmla v8.8h, v29.8h, v0.h[2]\n"
+      "fmla v12.8h, v29.8h, v1.h[2]\n"
+      "fmla v16.8h, v29.8h, v2.h[2]\n"
+      "fmla v20.8h, v29.8h, v3.h[2]\n"
+      "fmla v24.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      "fmla v9.8h, v28.8h, v0.h[2]\n"
+      "fmla v13.8h, v28.8h, v1.h[2]\n"
+      "fmla v17.8h, v28.8h, v2.h[2]\n"
+      "fmla v21.8h, v28.8h, v3.h[2]\n"
+      "fmla v25.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      "fmla v10.8h, v29.8h, v0.h[2]\n"
+      "fmla v14.8h, v29.8h, v1.h[2]\n"
+      "fmla v18.8h, v29.8h, v2.h[2]\n"
+      "fmla v22.8h, v29.8h, v3.h[2]\n"
+      "fmla v26.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      "fmla v11.8h, v28.8h, v0.h[2]\n"
+      "fmla v15.8h, v28.8h, v1.h[2]\n"
+      "fmla v19.8h, v28.8h, v2.h[2]\n"
+      "fmla v23.8h, v28.8h, v3.h[2]\n"
+      "fmla v27.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      "fmla v8.8h, v29.8h, v0.h[3]\n"
+      "fmla v12.8h, v29.8h, v1.h[3]\n"
+      "fmla v16.8h, v29.8h, v2.h[3]\n"
+      "fmla v20.8h, v29.8h, v3.h[3]\n"
+      "fmla v24.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      "fmla v9.8h, v28.8h, v0.h[3]\n"
+      "fmla v13.8h, v28.8h, v1.h[3]\n"
+      "fmla v17.8h, v28.8h, v2.h[3]\n"
+      "fmla v21.8h, v28.8h, v3.h[3]\n"
+      "fmla v25.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
+      "fmla v10.8h, v29.8h, v0.h[3]\n"
+      "fmla v14.8h, v29.8h, v1.h[3]\n"
+      "fmla v18.8h, v29.8h, v2.h[3]\n"
+      "fmla v22.8h, v29.8h, v3.h[3]\n"
+      "fmla v26.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0x100]\n"
+      "fmla v11.8h, v28.8h, v0.h[3]\n"
+      "fmla v15.8h, v28.8h, v1.h[3]\n"
+      "fmla v19.8h, v28.8h, v2.h[3]\n"
+      "fmla v23.8h, v28.8h, v3.h[3]\n"
+      "fmla v27.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x10, #0x110]\n"
+      "fmla v8.8h, v29.8h, v0.h[4]\n"
+      "fmla v12.8h, v29.8h, v1.h[4]\n"
+      "fmla v16.8h, v29.8h, v2.h[4]\n"
+      "fmla v20.8h, v29.8h, v3.h[4]\n"
+      "fmla v24.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x10, #0x120]\n"
+      "fmla v9.8h, v28.8h, v0.h[4]\n"
+      "fmla v13.8h, v28.8h, v1.h[4]\n"
+      "fmla v17.8h, v28.8h, v2.h[4]\n"
+      "fmla v21.8h, v28.8h, v3.h[4]\n"
+      "fmla v25.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x10, #0x130]\n"
+      "fmla v10.8h, v29.8h, v0.h[4]\n"
+      "fmla v14.8h, v29.8h, v1.h[4]\n"
+      "fmla v18.8h, v29.8h, v2.h[4]\n"
+      "fmla v22.8h, v29.8h, v3.h[4]\n"
+      "fmla v26.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x10, #0x140]\n"
+      "fmla v11.8h, v28.8h, v0.h[4]\n"
+      "fmla v15.8h, v28.8h, v1.h[4]\n"
+      "fmla v19.8h, v28.8h, v2.h[4]\n"
+      "fmla v23.8h, v28.8h, v3.h[4]\n"
+      "fmla v27.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x10, #0x150]\n"
+      "fmla v8.8h, v29.8h, v0.h[5]\n"
+      "fmla v12.8h, v29.8h, v1.h[5]\n"
+      "fmla v16.8h, v29.8h, v2.h[5]\n"
+      "fmla v20.8h, v29.8h, v3.h[5]\n"
+      "fmla v24.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x10, #0x160]\n"
+      "fmla v9.8h, v28.8h, v0.h[5]\n"
+      "fmla v13.8h, v28.8h, v1.h[5]\n"
+      "fmla v17.8h, v28.8h, v2.h[5]\n"
+      "fmla v21.8h, v28.8h, v3.h[5]\n"
+      "fmla v25.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x10, #0x170]\n"
+      "fmla v10.8h, v29.8h, v0.h[5]\n"
+      "fmla v14.8h, v29.8h, v1.h[5]\n"
+      "fmla v18.8h, v29.8h, v2.h[5]\n"
+      "fmla v22.8h, v29.8h, v3.h[5]\n"
+      "fmla v26.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x10, #0x180]\n"
+      "fmla v11.8h, v28.8h, v0.h[5]\n"
+      "fmla v15.8h, v28.8h, v1.h[5]\n"
+      "fmla v19.8h, v28.8h, v2.h[5]\n"
+      "fmla v23.8h, v28.8h, v3.h[5]\n"
+      "fmla v27.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x10, #0x190]\n"
+      "fmla v8.8h, v29.8h, v0.h[6]\n"
+      "fmla v12.8h, v29.8h, v1.h[6]\n"
+      "fmla v16.8h, v29.8h, v2.h[6]\n"
+      "fmla v20.8h, v29.8h, v3.h[6]\n"
+      "fmla v24.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x10, #0x1a0]\n"
+      "fmla v9.8h, v28.8h, v0.h[6]\n"
+      "fmla v13.8h, v28.8h, v1.h[6]\n"
+      "fmla v17.8h, v28.8h, v2.h[6]\n"
+      "fmla v21.8h, v28.8h, v3.h[6]\n"
+      "fmla v25.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x10, #0x1b0]\n"
+      "fmla v10.8h, v29.8h, v0.h[6]\n"
+      "fmla v14.8h, v29.8h, v1.h[6]\n"
+      "fmla v18.8h, v29.8h, v2.h[6]\n"
+      "fmla v22.8h, v29.8h, v3.h[6]\n"
+      "fmla v26.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x10, #0x1c0]\n"
+      "fmla v11.8h, v28.8h, v0.h[6]\n"
+      "fmla v15.8h, v28.8h, v1.h[6]\n"
+      "fmla v19.8h, v28.8h, v2.h[6]\n"
+      "fmla v23.8h, v28.8h, v3.h[6]\n"
+      "fmla v27.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x10, #0x1d0]\n"
+      "fmla v8.8h, v29.8h, v0.h[7]\n"
+      "fmla v12.8h, v29.8h, v1.h[7]\n"
+      "fmla v16.8h, v29.8h, v2.h[7]\n"
+      "fmla v20.8h, v29.8h, v3.h[7]\n"
+      "fmla v24.8h, v29.8h, v4.h[7]\n"
+      "ldr q29, [x10, #0x1e0]\n"
+      "fmla v9.8h, v28.8h, v0.h[7]\n"
+      "fmla v13.8h, v28.8h, v1.h[7]\n"
+      "fmla v17.8h, v28.8h, v2.h[7]\n"
+      "fmla v21.8h, v28.8h, v3.h[7]\n"
+      "fmla v25.8h, v28.8h, v4.h[7]\n"
+      "ldr q28, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v10.8h, v29.8h, v0.h[7]\n"
+      "fmla v14.8h, v29.8h, v1.h[7]\n"
+      "fmla v18.8h, v29.8h, v2.h[7]\n"
+      "fmla v22.8h, v29.8h, v3.h[7]\n"
+      "fmla v26.8h, v29.8h, v4.h[7]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v11.8h, v28.8h, v0.h[7]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v15.8h, v28.8h, v1.h[7]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v19.8h, v28.8h, v2.h[7]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v23.8h, v28.8h, v3.h[7]\n"
       "ldr q3, [x23, #0x0]\n"
-      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v27.8h, v28.8h, v4.h[7]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 222b\n"
@@ -3474,7 +3474,7 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       "fmla v9.8h, v7.8h, v0.h[0]\n"
       "add x22, x22, #0x10\n"
       "fmla v13.8h, v7.8h, v1.h[0]\n"
@@ -3483,224 +3483,224 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       "fmla v21.8h, v7.8h, v3.h[0]\n"
       "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v10.8h, v29.8h, v0.h[0]\n"
+      "fmla v14.8h, v29.8h, v1.h[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v3.h[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.8h, v6.8h, v0.h[1]\n"
-      "fmla v12.8h, v6.8h, v1.h[1]\n"
-      "fmla v16.8h, v6.8h, v2.h[1]\n"
-      "fmla v20.8h, v6.8h, v3.h[1]\n"
-      "fmla v24.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.8h, v7.8h, v0.h[1]\n"
-      "fmla v13.8h, v7.8h, v1.h[1]\n"
-      "fmla v17.8h, v7.8h, v2.h[1]\n"
-      "fmla v21.8h, v7.8h, v3.h[1]\n"
-      "fmla v25.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.8h, v6.8h, v0.h[1]\n"
-      "fmla v14.8h, v6.8h, v1.h[1]\n"
-      "fmla v18.8h, v6.8h, v2.h[1]\n"
-      "fmla v22.8h, v6.8h, v3.h[1]\n"
-      "fmla v26.8h, v6.8h, v4.h[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.8h, v7.8h, v0.h[1]\n"
-      "fmla v15.8h, v7.8h, v1.h[1]\n"
-      "fmla v19.8h, v7.8h, v2.h[1]\n"
-      "fmla v23.8h, v7.8h, v3.h[1]\n"
-      "fmla v27.8h, v7.8h, v4.h[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.8h, v6.8h, v0.h[2]\n"
-      "fmla v12.8h, v6.8h, v1.h[2]\n"
-      "fmla v16.8h, v6.8h, v2.h[2]\n"
-      "fmla v20.8h, v6.8h, v3.h[2]\n"
-      "fmla v24.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.8h, v7.8h, v0.h[2]\n"
-      "fmla v13.8h, v7.8h, v1.h[2]\n"
-      "fmla v17.8h, v7.8h, v2.h[2]\n"
-      "fmla v21.8h, v7.8h, v3.h[2]\n"
-      "fmla v25.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.8h, v6.8h, v0.h[2]\n"
-      "fmla v14.8h, v6.8h, v1.h[2]\n"
-      "fmla v18.8h, v6.8h, v2.h[2]\n"
-      "fmla v22.8h, v6.8h, v3.h[2]\n"
-      "fmla v26.8h, v6.8h, v4.h[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.8h, v7.8h, v0.h[2]\n"
-      "fmla v15.8h, v7.8h, v1.h[2]\n"
-      "fmla v19.8h, v7.8h, v2.h[2]\n"
-      "fmla v23.8h, v7.8h, v3.h[2]\n"
-      "fmla v27.8h, v7.8h, v4.h[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.8h, v6.8h, v0.h[3]\n"
-      "fmla v12.8h, v6.8h, v1.h[3]\n"
-      "fmla v16.8h, v6.8h, v2.h[3]\n"
-      "fmla v20.8h, v6.8h, v3.h[3]\n"
-      "fmla v24.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.8h, v7.8h, v0.h[3]\n"
-      "fmla v13.8h, v7.8h, v1.h[3]\n"
-      "fmla v17.8h, v7.8h, v2.h[3]\n"
-      "fmla v21.8h, v7.8h, v3.h[3]\n"
-      "fmla v25.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
-      "fmla v10.8h, v6.8h, v0.h[3]\n"
-      "fmla v14.8h, v6.8h, v1.h[3]\n"
-      "fmla v18.8h, v6.8h, v2.h[3]\n"
-      "fmla v22.8h, v6.8h, v3.h[3]\n"
-      "fmla v26.8h, v6.8h, v4.h[3]\n"
-      "ldr q6, [x10, #0x100]\n"
-      "fmla v11.8h, v7.8h, v0.h[3]\n"
-      "fmla v15.8h, v7.8h, v1.h[3]\n"
-      "fmla v19.8h, v7.8h, v2.h[3]\n"
-      "fmla v23.8h, v7.8h, v3.h[3]\n"
-      "fmla v27.8h, v7.8h, v4.h[3]\n"
-      "ldr q7, [x10, #0x110]\n"
-      "fmla v8.8h, v6.8h, v0.h[4]\n"
-      "fmla v12.8h, v6.8h, v1.h[4]\n"
-      "fmla v16.8h, v6.8h, v2.h[4]\n"
-      "fmla v20.8h, v6.8h, v3.h[4]\n"
-      "fmla v24.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x10, #0x120]\n"
-      "fmla v9.8h, v7.8h, v0.h[4]\n"
-      "fmla v13.8h, v7.8h, v1.h[4]\n"
-      "fmla v17.8h, v7.8h, v2.h[4]\n"
-      "fmla v21.8h, v7.8h, v3.h[4]\n"
-      "fmla v25.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x10, #0x130]\n"
-      "fmla v10.8h, v6.8h, v0.h[4]\n"
-      "fmla v14.8h, v6.8h, v1.h[4]\n"
-      "fmla v18.8h, v6.8h, v2.h[4]\n"
-      "fmla v22.8h, v6.8h, v3.h[4]\n"
-      "fmla v26.8h, v6.8h, v4.h[4]\n"
-      "ldr q6, [x10, #0x140]\n"
-      "fmla v11.8h, v7.8h, v0.h[4]\n"
-      "fmla v15.8h, v7.8h, v1.h[4]\n"
-      "fmla v19.8h, v7.8h, v2.h[4]\n"
-      "fmla v23.8h, v7.8h, v3.h[4]\n"
-      "fmla v27.8h, v7.8h, v4.h[4]\n"
-      "ldr q7, [x10, #0x150]\n"
-      "fmla v8.8h, v6.8h, v0.h[5]\n"
-      "fmla v12.8h, v6.8h, v1.h[5]\n"
-      "fmla v16.8h, v6.8h, v2.h[5]\n"
-      "fmla v20.8h, v6.8h, v3.h[5]\n"
-      "fmla v24.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x10, #0x160]\n"
-      "fmla v9.8h, v7.8h, v0.h[5]\n"
-      "fmla v13.8h, v7.8h, v1.h[5]\n"
-      "fmla v17.8h, v7.8h, v2.h[5]\n"
-      "fmla v21.8h, v7.8h, v3.h[5]\n"
-      "fmla v25.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x10, #0x170]\n"
-      "fmla v10.8h, v6.8h, v0.h[5]\n"
-      "fmla v14.8h, v6.8h, v1.h[5]\n"
-      "fmla v18.8h, v6.8h, v2.h[5]\n"
-      "fmla v22.8h, v6.8h, v3.h[5]\n"
-      "fmla v26.8h, v6.8h, v4.h[5]\n"
-      "ldr q6, [x10, #0x180]\n"
-      "fmla v11.8h, v7.8h, v0.h[5]\n"
-      "fmla v15.8h, v7.8h, v1.h[5]\n"
-      "fmla v19.8h, v7.8h, v2.h[5]\n"
-      "fmla v23.8h, v7.8h, v3.h[5]\n"
-      "fmla v27.8h, v7.8h, v4.h[5]\n"
-      "ldr q7, [x10, #0x190]\n"
-      "fmla v8.8h, v6.8h, v0.h[6]\n"
-      "fmla v12.8h, v6.8h, v1.h[6]\n"
-      "fmla v16.8h, v6.8h, v2.h[6]\n"
-      "fmla v20.8h, v6.8h, v3.h[6]\n"
-      "fmla v24.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x10, #0x1a0]\n"
-      "fmla v9.8h, v7.8h, v0.h[6]\n"
-      "fmla v13.8h, v7.8h, v1.h[6]\n"
-      "fmla v17.8h, v7.8h, v2.h[6]\n"
-      "fmla v21.8h, v7.8h, v3.h[6]\n"
-      "fmla v25.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x10, #0x1b0]\n"
-      "fmla v10.8h, v6.8h, v0.h[6]\n"
-      "fmla v14.8h, v6.8h, v1.h[6]\n"
-      "fmla v18.8h, v6.8h, v2.h[6]\n"
-      "fmla v22.8h, v6.8h, v3.h[6]\n"
-      "fmla v26.8h, v6.8h, v4.h[6]\n"
-      "ldr q6, [x10, #0x1c0]\n"
-      "fmla v11.8h, v7.8h, v0.h[6]\n"
-      "fmla v15.8h, v7.8h, v1.h[6]\n"
-      "fmla v19.8h, v7.8h, v2.h[6]\n"
-      "fmla v23.8h, v7.8h, v3.h[6]\n"
-      "fmla v27.8h, v7.8h, v4.h[6]\n"
-      "ldr q7, [x10, #0x1d0]\n"
-      "fmla v8.8h, v6.8h, v0.h[7]\n"
-      "fmla v12.8h, v6.8h, v1.h[7]\n"
-      "fmla v16.8h, v6.8h, v2.h[7]\n"
-      "fmla v20.8h, v6.8h, v3.h[7]\n"
-      "fmla v24.8h, v6.8h, v4.h[7]\n"
-      "ldr q6, [x10, #0x1e0]\n"
-      "fmla v9.8h, v7.8h, v0.h[7]\n"
-      "fmla v13.8h, v7.8h, v1.h[7]\n"
-      "fmla v17.8h, v7.8h, v2.h[7]\n"
-      "fmla v21.8h, v7.8h, v3.h[7]\n"
-      "fmla v25.8h, v7.8h, v4.h[7]\n"
-      "ldr q7, [x10, #0x1f0]\n"
+      "fmla v26.8h, v29.8h, v4.h[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      "fmla v11.8h, v28.8h, v0.h[0]\n"
+      "fmla v15.8h, v28.8h, v1.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v3.h[0]\n"
+      "fmla v27.8h, v28.8h, v4.h[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      "fmla v8.8h, v29.8h, v0.h[1]\n"
+      "fmla v12.8h, v29.8h, v1.h[1]\n"
+      "fmla v16.8h, v29.8h, v2.h[1]\n"
+      "fmla v20.8h, v29.8h, v3.h[1]\n"
+      "fmla v24.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      "fmla v9.8h, v28.8h, v0.h[1]\n"
+      "fmla v13.8h, v28.8h, v1.h[1]\n"
+      "fmla v17.8h, v28.8h, v2.h[1]\n"
+      "fmla v21.8h, v28.8h, v3.h[1]\n"
+      "fmla v25.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      "fmla v10.8h, v29.8h, v0.h[1]\n"
+      "fmla v14.8h, v29.8h, v1.h[1]\n"
+      "fmla v18.8h, v29.8h, v2.h[1]\n"
+      "fmla v22.8h, v29.8h, v3.h[1]\n"
+      "fmla v26.8h, v29.8h, v4.h[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      "fmla v11.8h, v28.8h, v0.h[1]\n"
+      "fmla v15.8h, v28.8h, v1.h[1]\n"
+      "fmla v19.8h, v28.8h, v2.h[1]\n"
+      "fmla v23.8h, v28.8h, v3.h[1]\n"
+      "fmla v27.8h, v28.8h, v4.h[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      "fmla v8.8h, v29.8h, v0.h[2]\n"
+      "fmla v12.8h, v29.8h, v1.h[2]\n"
+      "fmla v16.8h, v29.8h, v2.h[2]\n"
+      "fmla v20.8h, v29.8h, v3.h[2]\n"
+      "fmla v24.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      "fmla v9.8h, v28.8h, v0.h[2]\n"
+      "fmla v13.8h, v28.8h, v1.h[2]\n"
+      "fmla v17.8h, v28.8h, v2.h[2]\n"
+      "fmla v21.8h, v28.8h, v3.h[2]\n"
+      "fmla v25.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      "fmla v10.8h, v29.8h, v0.h[2]\n"
+      "fmla v14.8h, v29.8h, v1.h[2]\n"
+      "fmla v18.8h, v29.8h, v2.h[2]\n"
+      "fmla v22.8h, v29.8h, v3.h[2]\n"
+      "fmla v26.8h, v29.8h, v4.h[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      "fmla v11.8h, v28.8h, v0.h[2]\n"
+      "fmla v15.8h, v28.8h, v1.h[2]\n"
+      "fmla v19.8h, v28.8h, v2.h[2]\n"
+      "fmla v23.8h, v28.8h, v3.h[2]\n"
+      "fmla v27.8h, v28.8h, v4.h[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      "fmla v8.8h, v29.8h, v0.h[3]\n"
+      "fmla v12.8h, v29.8h, v1.h[3]\n"
+      "fmla v16.8h, v29.8h, v2.h[3]\n"
+      "fmla v20.8h, v29.8h, v3.h[3]\n"
+      "fmla v24.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      "fmla v9.8h, v28.8h, v0.h[3]\n"
+      "fmla v13.8h, v28.8h, v1.h[3]\n"
+      "fmla v17.8h, v28.8h, v2.h[3]\n"
+      "fmla v21.8h, v28.8h, v3.h[3]\n"
+      "fmla v25.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
+      "fmla v10.8h, v29.8h, v0.h[3]\n"
+      "fmla v14.8h, v29.8h, v1.h[3]\n"
+      "fmla v18.8h, v29.8h, v2.h[3]\n"
+      "fmla v22.8h, v29.8h, v3.h[3]\n"
+      "fmla v26.8h, v29.8h, v4.h[3]\n"
+      "ldr q29, [x10, #0x100]\n"
+      "fmla v11.8h, v28.8h, v0.h[3]\n"
+      "fmla v15.8h, v28.8h, v1.h[3]\n"
+      "fmla v19.8h, v28.8h, v2.h[3]\n"
+      "fmla v23.8h, v28.8h, v3.h[3]\n"
+      "fmla v27.8h, v28.8h, v4.h[3]\n"
+      "ldr q28, [x10, #0x110]\n"
+      "fmla v8.8h, v29.8h, v0.h[4]\n"
+      "fmla v12.8h, v29.8h, v1.h[4]\n"
+      "fmla v16.8h, v29.8h, v2.h[4]\n"
+      "fmla v20.8h, v29.8h, v3.h[4]\n"
+      "fmla v24.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x10, #0x120]\n"
+      "fmla v9.8h, v28.8h, v0.h[4]\n"
+      "fmla v13.8h, v28.8h, v1.h[4]\n"
+      "fmla v17.8h, v28.8h, v2.h[4]\n"
+      "fmla v21.8h, v28.8h, v3.h[4]\n"
+      "fmla v25.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x10, #0x130]\n"
+      "fmla v10.8h, v29.8h, v0.h[4]\n"
+      "fmla v14.8h, v29.8h, v1.h[4]\n"
+      "fmla v18.8h, v29.8h, v2.h[4]\n"
+      "fmla v22.8h, v29.8h, v3.h[4]\n"
+      "fmla v26.8h, v29.8h, v4.h[4]\n"
+      "ldr q29, [x10, #0x140]\n"
+      "fmla v11.8h, v28.8h, v0.h[4]\n"
+      "fmla v15.8h, v28.8h, v1.h[4]\n"
+      "fmla v19.8h, v28.8h, v2.h[4]\n"
+      "fmla v23.8h, v28.8h, v3.h[4]\n"
+      "fmla v27.8h, v28.8h, v4.h[4]\n"
+      "ldr q28, [x10, #0x150]\n"
+      "fmla v8.8h, v29.8h, v0.h[5]\n"
+      "fmla v12.8h, v29.8h, v1.h[5]\n"
+      "fmla v16.8h, v29.8h, v2.h[5]\n"
+      "fmla v20.8h, v29.8h, v3.h[5]\n"
+      "fmla v24.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x10, #0x160]\n"
+      "fmla v9.8h, v28.8h, v0.h[5]\n"
+      "fmla v13.8h, v28.8h, v1.h[5]\n"
+      "fmla v17.8h, v28.8h, v2.h[5]\n"
+      "fmla v21.8h, v28.8h, v3.h[5]\n"
+      "fmla v25.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x10, #0x170]\n"
+      "fmla v10.8h, v29.8h, v0.h[5]\n"
+      "fmla v14.8h, v29.8h, v1.h[5]\n"
+      "fmla v18.8h, v29.8h, v2.h[5]\n"
+      "fmla v22.8h, v29.8h, v3.h[5]\n"
+      "fmla v26.8h, v29.8h, v4.h[5]\n"
+      "ldr q29, [x10, #0x180]\n"
+      "fmla v11.8h, v28.8h, v0.h[5]\n"
+      "fmla v15.8h, v28.8h, v1.h[5]\n"
+      "fmla v19.8h, v28.8h, v2.h[5]\n"
+      "fmla v23.8h, v28.8h, v3.h[5]\n"
+      "fmla v27.8h, v28.8h, v4.h[5]\n"
+      "ldr q28, [x10, #0x190]\n"
+      "fmla v8.8h, v29.8h, v0.h[6]\n"
+      "fmla v12.8h, v29.8h, v1.h[6]\n"
+      "fmla v16.8h, v29.8h, v2.h[6]\n"
+      "fmla v20.8h, v29.8h, v3.h[6]\n"
+      "fmla v24.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x10, #0x1a0]\n"
+      "fmla v9.8h, v28.8h, v0.h[6]\n"
+      "fmla v13.8h, v28.8h, v1.h[6]\n"
+      "fmla v17.8h, v28.8h, v2.h[6]\n"
+      "fmla v21.8h, v28.8h, v3.h[6]\n"
+      "fmla v25.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x10, #0x1b0]\n"
+      "fmla v10.8h, v29.8h, v0.h[6]\n"
+      "fmla v14.8h, v29.8h, v1.h[6]\n"
+      "fmla v18.8h, v29.8h, v2.h[6]\n"
+      "fmla v22.8h, v29.8h, v3.h[6]\n"
+      "fmla v26.8h, v29.8h, v4.h[6]\n"
+      "ldr q29, [x10, #0x1c0]\n"
+      "fmla v11.8h, v28.8h, v0.h[6]\n"
+      "fmla v15.8h, v28.8h, v1.h[6]\n"
+      "fmla v19.8h, v28.8h, v2.h[6]\n"
+      "fmla v23.8h, v28.8h, v3.h[6]\n"
+      "fmla v27.8h, v28.8h, v4.h[6]\n"
+      "ldr q28, [x10, #0x1d0]\n"
+      "fmla v8.8h, v29.8h, v0.h[7]\n"
+      "fmla v12.8h, v29.8h, v1.h[7]\n"
+      "fmla v16.8h, v29.8h, v2.h[7]\n"
+      "fmla v20.8h, v29.8h, v3.h[7]\n"
+      "fmla v24.8h, v29.8h, v4.h[7]\n"
+      "ldr q29, [x10, #0x1e0]\n"
+      "fmla v9.8h, v28.8h, v0.h[7]\n"
+      "fmla v13.8h, v28.8h, v1.h[7]\n"
+      "fmla v17.8h, v28.8h, v2.h[7]\n"
+      "fmla v21.8h, v28.8h, v3.h[7]\n"
+      "fmla v25.8h, v28.8h, v4.h[7]\n"
+      "ldr q28, [x10, #0x1f0]\n"
       "add x10, x10, #0x200\n"
-      "fmla v10.8h, v6.8h, v0.h[7]\n"
-      "fmla v14.8h, v6.8h, v1.h[7]\n"
-      "fmla v18.8h, v6.8h, v2.h[7]\n"
-      "fmla v22.8h, v6.8h, v3.h[7]\n"
-      "fmla v26.8h, v6.8h, v4.h[7]\n"
-      "fmla v11.8h, v7.8h, v0.h[7]\n"
-      "fmla v15.8h, v7.8h, v1.h[7]\n"
-      "fmla v19.8h, v7.8h, v2.h[7]\n"
-      "fmla v23.8h, v7.8h, v3.h[7]\n"
-      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v10.8h, v29.8h, v0.h[7]\n"
+      "fmla v14.8h, v29.8h, v1.h[7]\n"
+      "fmla v18.8h, v29.8h, v2.h[7]\n"
+      "fmla v22.8h, v29.8h, v3.h[7]\n"
+      "fmla v26.8h, v29.8h, v4.h[7]\n"
+      "fmla v11.8h, v28.8h, v0.h[7]\n"
+      "fmla v15.8h, v28.8h, v1.h[7]\n"
+      "fmla v19.8h, v28.8h, v2.h[7]\n"
+      "fmla v23.8h, v28.8h, v3.h[7]\n"
+      "fmla v27.8h, v28.8h, v4.h[7]\n"
       "224:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 226f\n"
       "225:"  // Height 5: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h4, [x26], #0x2\n"
+      "ldr h3, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
       "ldr h2, [x24], #0x2\n"
-      "ldr h3, [x23], #0x2\n"
-      "ldr h4, [x22], #0x2\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr h1, [x23], #0x2\n"
+      "ldr h0, [x22], #0x2\n"
+      "ldr q29, [x10, #0x0]\n"
+      "fmla v8.8h, v29.8h, v4.h[0]\n"
+      "fmla v12.8h, v29.8h, v3.h[0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      "fmla v16.8h, v29.8h, v2.h[0]\n"
+      "fmla v20.8h, v29.8h, v1.h[0]\n"
+      "fmla v24.8h, v29.8h, v0.h[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "fmla v9.8h, v28.8h, v4.h[0]\n"
+      "fmla v13.8h, v28.8h, v3.h[0]\n"
+      "fmla v17.8h, v28.8h, v2.h[0]\n"
+      "fmla v21.8h, v28.8h, v1.h[0]\n"
+      "fmla v25.8h, v28.8h, v0.h[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "fmla v10.8h, v29.8h, v4.h[0]\n"
+      "fmla v14.8h, v29.8h, v3.h[0]\n"
+      "fmla v18.8h, v29.8h, v2.h[0]\n"
+      "fmla v22.8h, v29.8h, v1.h[0]\n"
+      "fmla v26.8h, v29.8h, v0.h[0]\n"
+      "fmla v11.8h, v28.8h, v4.h[0]\n"
+      "fmla v15.8h, v28.8h, v3.h[0]\n"
+      "fmla v19.8h, v28.8h, v2.h[0]\n"
+      "fmla v23.8h, v28.8h, v1.h[0]\n"
+      "fmla v27.8h, v28.8h, v0.h[0]\n"
       "cbnz x27, 225b\n"
       "226:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3719,49 +3719,49 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 227f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.8h }, [x20]\n"
+      "ld1r { v29.8h }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.8h }, [x20]\n"
-      "fmin v8.8h, v8.8h, v1.8h\n"
-      "fmin v9.8h, v9.8h, v1.8h\n"
-      "fmin v10.8h, v10.8h, v1.8h\n"
-      "fmin v11.8h, v11.8h, v1.8h\n"
-      "fmin v12.8h, v12.8h, v1.8h\n"
-      "fmin v13.8h, v13.8h, v1.8h\n"
-      "fmin v14.8h, v14.8h, v1.8h\n"
-      "fmin v15.8h, v15.8h, v1.8h\n"
-      "fmin v16.8h, v16.8h, v1.8h\n"
-      "fmin v17.8h, v17.8h, v1.8h\n"
-      "fmin v18.8h, v18.8h, v1.8h\n"
-      "fmin v19.8h, v19.8h, v1.8h\n"
-      "fmin v20.8h, v20.8h, v1.8h\n"
-      "fmin v21.8h, v21.8h, v1.8h\n"
-      "fmin v22.8h, v22.8h, v1.8h\n"
-      "fmin v23.8h, v23.8h, v1.8h\n"
-      "fmin v24.8h, v24.8h, v1.8h\n"
-      "fmin v25.8h, v25.8h, v1.8h\n"
-      "fmin v26.8h, v26.8h, v1.8h\n"
-      "fmin v27.8h, v27.8h, v1.8h\n"
-      "fmax v8.8h, v8.8h, v0.8h\n"
-      "fmax v9.8h, v9.8h, v0.8h\n"
-      "fmax v10.8h, v10.8h, v0.8h\n"
-      "fmax v11.8h, v11.8h, v0.8h\n"
-      "fmax v12.8h, v12.8h, v0.8h\n"
-      "fmax v13.8h, v13.8h, v0.8h\n"
-      "fmax v14.8h, v14.8h, v0.8h\n"
-      "fmax v15.8h, v15.8h, v0.8h\n"
-      "fmax v16.8h, v16.8h, v0.8h\n"
-      "fmax v17.8h, v17.8h, v0.8h\n"
-      "fmax v18.8h, v18.8h, v0.8h\n"
-      "fmax v19.8h, v19.8h, v0.8h\n"
-      "fmax v20.8h, v20.8h, v0.8h\n"
-      "fmax v21.8h, v21.8h, v0.8h\n"
-      "fmax v22.8h, v22.8h, v0.8h\n"
-      "fmax v23.8h, v23.8h, v0.8h\n"
-      "fmax v24.8h, v24.8h, v0.8h\n"
-      "fmax v25.8h, v25.8h, v0.8h\n"
-      "fmax v26.8h, v26.8h, v0.8h\n"
-      "fmax v27.8h, v27.8h, v0.8h\n"
+      "ld1r { v28.8h }, [x20]\n"
+      "fmin v8.8h, v8.8h, v29.8h\n"
+      "fmin v9.8h, v9.8h, v29.8h\n"
+      "fmin v10.8h, v10.8h, v29.8h\n"
+      "fmin v11.8h, v11.8h, v29.8h\n"
+      "fmin v12.8h, v12.8h, v29.8h\n"
+      "fmin v13.8h, v13.8h, v29.8h\n"
+      "fmin v14.8h, v14.8h, v29.8h\n"
+      "fmin v15.8h, v15.8h, v29.8h\n"
+      "fmin v16.8h, v16.8h, v29.8h\n"
+      "fmin v17.8h, v17.8h, v29.8h\n"
+      "fmin v18.8h, v18.8h, v29.8h\n"
+      "fmin v19.8h, v19.8h, v29.8h\n"
+      "fmin v20.8h, v20.8h, v29.8h\n"
+      "fmin v21.8h, v21.8h, v29.8h\n"
+      "fmin v22.8h, v22.8h, v29.8h\n"
+      "fmin v23.8h, v23.8h, v29.8h\n"
+      "fmin v24.8h, v24.8h, v29.8h\n"
+      "fmin v25.8h, v25.8h, v29.8h\n"
+      "fmin v26.8h, v26.8h, v29.8h\n"
+      "fmin v27.8h, v27.8h, v29.8h\n"
+      "fmax v8.8h, v8.8h, v28.8h\n"
+      "fmax v9.8h, v9.8h, v28.8h\n"
+      "fmax v10.8h, v10.8h, v28.8h\n"
+      "fmax v11.8h, v11.8h, v28.8h\n"
+      "fmax v12.8h, v12.8h, v28.8h\n"
+      "fmax v13.8h, v13.8h, v28.8h\n"
+      "fmax v14.8h, v14.8h, v28.8h\n"
+      "fmax v15.8h, v15.8h, v28.8h\n"
+      "fmax v16.8h, v16.8h, v28.8h\n"
+      "fmax v17.8h, v17.8h, v28.8h\n"
+      "fmax v18.8h, v18.8h, v28.8h\n"
+      "fmax v19.8h, v19.8h, v28.8h\n"
+      "fmax v20.8h, v20.8h, v28.8h\n"
+      "fmax v21.8h, v21.8h, v28.8h\n"
+      "fmax v22.8h, v22.8h, v28.8h\n"
+      "fmax v23.8h, v23.8h, v28.8h\n"
+      "fmax v24.8h, v24.8h, v28.8h\n"
+      "fmax v25.8h, v25.8h, v28.8h\n"
+      "fmax v26.8h, v26.8h, v28.8h\n"
+      "fmax v27.8h, v27.8h, v28.8h\n"
       "227:"  // Height 5: No activation
       "cmp x11, #0x20\n"
       "bge 244f\n"
@@ -4386,16 +4386,16 @@
       "268:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 269f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 270f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -4407,11 +4407,11 @@
       "b 270f\n"
       "269:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "270:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "blt 273f\n"
@@ -4912,42 +4912,42 @@
       "273:"  // Height 6: Multiply loop: Main loop skip
       "cbz x27, 275f\n"
       "274:"  // Height 6: Multiply loop: Odd block loop
-      "ldr h0, [x26], #0x2\n"
-      "ldr h1, [x25], #0x2\n"
+      "ldr h7, [x26], #0x2\n"
+      "ldr h6, [x25], #0x2\n"
       "sub x27, x27, #0x1\n"
-      "ldr h2, [x24], #0x2\n"
-      "ldr h3, [x23], #0x2\n"
-      "ldr h4, [x22], #0x2\n"
-      "ldr h5, [x21], #0x2\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v8.8h, v6.8h, v0.h[0]\n"
-      "fmla v12.8h, v6.8h, v1.h[0]\n"
-      "fmla v16.8h, v6.8h, v2.h[0]\n"
-      "fmla v20.8h, v6.8h, v3.h[0]\n"
-      "fmla v24.8h, v6.8h, v4.h[0]\n"
-      "fmla v28.8h, v6.8h, v5.h[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.8h, v7.8h, v0.h[0]\n"
-      "fmla v13.8h, v7.8h, v1.h[0]\n"
-      "fmla v17.8h, v7.8h, v2.h[0]\n"
-      "fmla v21.8h, v7.8h, v3.h[0]\n"
-      "fmla v25.8h, v7.8h, v4.h[0]\n"
-      "fmla v29.8h, v7.8h, v5.h[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr h5, [x24], #0x2\n"
+      "ldr h4, [x23], #0x2\n"
+      "ldr h3, [x22], #0x2\n"
+      "ldr h2, [x21], #0x2\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      "fmla v8.8h, v1.8h, v7.h[0]\n"
+      "fmla v12.8h, v1.8h, v6.h[0]\n"
+      "fmla v16.8h, v1.8h, v5.h[0]\n"
+      "fmla v20.8h, v1.8h, v4.h[0]\n"
+      "fmla v24.8h, v1.8h, v3.h[0]\n"
+      "fmla v28.8h, v1.8h, v2.h[0]\n"
+      "ldr q1, [x10, #0x20]\n"
+      "fmla v9.8h, v0.8h, v7.h[0]\n"
+      "fmla v13.8h, v0.8h, v6.h[0]\n"
+      "fmla v17.8h, v0.8h, v5.h[0]\n"
+      "fmla v21.8h, v0.8h, v4.h[0]\n"
+      "fmla v25.8h, v0.8h, v3.h[0]\n"
+      "fmla v29.8h, v0.8h, v2.h[0]\n"
+      "ldr q0, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.8h, v6.8h, v0.h[0]\n"
-      "fmla v14.8h, v6.8h, v1.h[0]\n"
-      "fmla v18.8h, v6.8h, v2.h[0]\n"
-      "fmla v22.8h, v6.8h, v3.h[0]\n"
-      "fmla v26.8h, v6.8h, v4.h[0]\n"
-      "fmla v30.8h, v6.8h, v5.h[0]\n"
-      "fmla v11.8h, v7.8h, v0.h[0]\n"
-      "fmla v15.8h, v7.8h, v1.h[0]\n"
-      "fmla v19.8h, v7.8h, v2.h[0]\n"
-      "fmla v23.8h, v7.8h, v3.h[0]\n"
-      "fmla v27.8h, v7.8h, v4.h[0]\n"
-      "fmla v31.8h, v7.8h, v5.h[0]\n"
+      "fmla v10.8h, v1.8h, v7.h[0]\n"
+      "fmla v14.8h, v1.8h, v6.h[0]\n"
+      "fmla v18.8h, v1.8h, v5.h[0]\n"
+      "fmla v22.8h, v1.8h, v4.h[0]\n"
+      "fmla v26.8h, v1.8h, v3.h[0]\n"
+      "fmla v30.8h, v1.8h, v2.h[0]\n"
+      "fmla v11.8h, v0.8h, v7.h[0]\n"
+      "fmla v15.8h, v0.8h, v6.h[0]\n"
+      "fmla v19.8h, v0.8h, v5.h[0]\n"
+      "fmla v23.8h, v0.8h, v4.h[0]\n"
+      "fmla v27.8h, v0.8h, v3.h[0]\n"
+      "fmla v31.8h, v0.8h, v2.h[0]\n"
       "cbnz x27, 274b\n"
       "275:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -5317,7 +5317,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "296:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
index e155bfb..171929e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -113,5 +113,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
index 700d803..9ceda8f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
@@ -92,7 +92,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 124f\n"
@@ -223,11 +222,11 @@
       "19:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w12, [x20, x13, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x11, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x11, [x20, #0x0]\n"
       "cbnz x13, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x11, x11, x20, LSL #2\n"
@@ -246,176 +245,176 @@
       "blt 23f\n"
       "22:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr d4, [x15, #0x40]\n"
-      "ldr x10, [x15, #0x48]\n"
+      "ldr d19, [x15, #0x40]\n"
+      "ldr x20, [x15, #0x48]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr d5, [x15, #0x50]\n"
+      "ldr d18, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr d6, [x15, #0x60]\n"
+      "ldr d17, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x9, [x15, #0x58]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x28, [x15, #0x68]\n"
-      "mov v6.d[1], x28\n"
-      "ldr x27, [x15, #0x78]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "ldr d4, [x15, #0x80]\n"
-      "ldr x10, [x15, #0x88]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "ldr d5, [x15, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x9, [x15, #0x98]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x28, [x15, #0xa8]\n"
-      "mov v6.d[1], x28\n"
-      "ldr x27, [x15, #0xb8]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "ldr d4, [x15, #0xc0]\n"
-      "ldr x10, [x15, #0xc8]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "ldr d5, [x15, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x9, [x15, #0xd8]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x28, [x15, #0xe8]\n"
-      "mov v6.d[1], x28\n"
-      "ldr x27, [x15, #0xf8]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "ldr d4, [x15, #0x100]\n"
-      "ldr x10, [x15, #0x108]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "ldr d5, [x15, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr d6, [x15, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr d7, [x15, #0x130]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x9, [x15, #0x118]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x28, [x15, #0x128]\n"
-      "mov v6.d[1], x28\n"
-      "ldr x27, [x15, #0x138]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "ldr d4, [x15, #0x140]\n"
-      "ldr x10, [x15, #0x148]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "ldr d5, [x15, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr d6, [x15, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr d7, [x15, #0x170]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x9, [x15, #0x158]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x28, [x15, #0x168]\n"
-      "mov v6.d[1], x28\n"
-      "ldr x27, [x15, #0x178]\n"
-      "mov v7.d[1], x27\n"
+      "ldr d16, [x15, #0x70]\n"
+      "mov v19.d[1], x20\n"
+      "ldr x20, [x15, #0x58]\n"
+      "mov v18.d[1], x20\n"
+      "ldr x20, [x15, #0x68]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x78]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v12.4s, v19.4s, v0.s[0]\n"
+      "ldr d19, [x15, #0x80]\n"
+      "ldr x20, [x15, #0x88]\n"
+      "fmla v13.4s, v18.4s, v0.s[0]\n"
+      "ldr d18, [x15, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr d17, [x15, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr d16, [x15, #0xb0]\n"
+      "mov v19.d[1], x20\n"
+      "ldr x20, [x15, #0x98]\n"
+      "mov v18.d[1], x20\n"
+      "ldr x20, [x15, #0xa8]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0xb8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.4s, v19.4s, v0.s[1]\n"
+      "ldr d19, [x15, #0xc0]\n"
+      "ldr x20, [x15, #0xc8]\n"
+      "fmla v11.4s, v18.4s, v0.s[1]\n"
+      "ldr d18, [x15, #0xd0]\n"
+      "fmla v12.4s, v17.4s, v0.s[1]\n"
+      "ldr d17, [x15, #0xe0]\n"
+      "fmla v13.4s, v16.4s, v0.s[1]\n"
+      "ldr d16, [x15, #0xf0]\n"
+      "mov v19.d[1], x20\n"
+      "ldr x20, [x15, #0xd8]\n"
+      "mov v18.d[1], x20\n"
+      "ldr x20, [x15, #0xe8]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0xf8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v19.4s, v0.s[2]\n"
+      "ldr d19, [x15, #0x100]\n"
+      "ldr x20, [x15, #0x108]\n"
+      "fmla v9.4s, v18.4s, v0.s[2]\n"
+      "ldr d18, [x15, #0x110]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr d17, [x15, #0x120]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr d16, [x15, #0x130]\n"
+      "mov v19.d[1], x20\n"
+      "ldr x20, [x15, #0x118]\n"
+      "mov v18.d[1], x20\n"
+      "ldr x20, [x15, #0x128]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x138]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v12.4s, v19.4s, v0.s[2]\n"
+      "ldr d19, [x15, #0x140]\n"
+      "ldr x20, [x15, #0x148]\n"
+      "fmla v13.4s, v18.4s, v0.s[2]\n"
+      "ldr d18, [x15, #0x150]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr d17, [x15, #0x160]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr d16, [x15, #0x170]\n"
+      "mov v19.d[1], x20\n"
+      "ldr x20, [x15, #0x158]\n"
+      "mov v18.d[1], x20\n"
+      "ldr x20, [x15, #0x168]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x178]\n"
+      "mov v16.d[1], x20\n"
       "add x11, x11, #0x10\n"
       "add x15, x15, #0x180\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
+      "fmla v10.4s, v19.4s, v0.s[3]\n"
       "ldr d4, [x15, #0x0]\n"
-      "ldr x10, [x15, #0x8]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
+      "ldr x20, [x15, #0x8]\n"
+      "fmla v11.4s, v18.4s, v0.s[3]\n"
       "ldr d5, [x15, #0x10]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v0.s[3]\n"
       "ldr d6, [x15, #0x20]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v0.s[3]\n"
       "ldr d0, [x11, #0x0]\n"
       "sub x12, x12, #0x4\n"
       "ldr d7, [x15, #0x30]\n"
       "cmp x12, #0x8\n"
-      "ldr x9, [x15, #0x18]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x28, [x15, #0x28]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x26, [x11, #0x8]\n"
-      "mov v6.d[1], x28\n"
-      "ldr x27, [x15, #0x38]\n"
-      "mov v0.d[1], x26\n"
-      "mov v7.d[1], x27\n"
+      "ldr x21, [x15, #0x18]\n"
+      "mov v4.d[1], x20\n"
+      "ldr x20, [x15, #0x28]\n"
+      "mov v5.d[1], x21\n"
+      "ldr x21, [x11, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x15, #0x38]\n"
+      "mov v0.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "prfm pldl1keep, [x11, #0x80]\n"
       "bge 22b\n"
       "23:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
+      "ldr q19, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
+      "ldr q18, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x15, #0x60]\n"
+      "ldr q17, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x15, #0x70]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "ldr q4, [x15, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "ldr q4, [x15, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "ldr q5, [x15, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "ldr q4, [x15, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "ldr q5, [x15, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x15, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x15, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "ldr q4, [x15, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "ldr q5, [x15, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x15, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x15, #0x170]\n"
+      "ldr q16, [x15, #0x70]\n"
+      "fmla v12.4s, v19.4s, v0.s[0]\n"
+      "ldr q19, [x15, #0x80]\n"
+      "fmla v13.4s, v18.4s, v0.s[0]\n"
+      "ldr q18, [x15, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x15, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x15, #0xb0]\n"
+      "fmla v10.4s, v19.4s, v0.s[1]\n"
+      "ldr q19, [x15, #0xc0]\n"
+      "fmla v11.4s, v18.4s, v0.s[1]\n"
+      "ldr q18, [x15, #0xd0]\n"
+      "fmla v12.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x15, #0xe0]\n"
+      "fmla v13.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x15, #0xf0]\n"
+      "fmla v8.4s, v19.4s, v0.s[2]\n"
+      "ldr q19, [x15, #0x100]\n"
+      "fmla v9.4s, v18.4s, v0.s[2]\n"
+      "ldr q18, [x15, #0x110]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x15, #0x120]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x15, #0x130]\n"
+      "fmla v12.4s, v19.4s, v0.s[2]\n"
+      "ldr q19, [x15, #0x140]\n"
+      "fmla v13.4s, v18.4s, v0.s[2]\n"
+      "ldr q18, [x15, #0x150]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x15, #0x160]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x15, #0x170]\n"
       "add x11, x11, #0x10\n"
       "sub x12, x12, #0x4\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
+      "fmla v10.4s, v19.4s, v0.s[3]\n"
       "prfm pldl1keep, [x11, #0x80]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
+      "fmla v11.4s, v18.4s, v0.s[3]\n"
       "add x15, x15, #0x180\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v0.s[3]\n"
       "24:"  // Height 1: Multiply loop: Main loop skip
       "cbz x12, 26f\n"
       "25:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
+      "ldr s17, [x11], #0x4\n"
       "sub x12, x12, #0x1\n"
-      "ldr q4, [x15, #0x0]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x10]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
+      "ldr q16, [x15, #0x0]\n"
+      "fmla v8.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x15, #0x10]\n"
+      "fmla v9.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x15, #0x20]\n"
+      "fmla v10.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      "fmla v11.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x15, #0x40]\n"
+      "fmla v12.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x15, #0x50]\n"
+      "fmla v13.4s, v16.4s, v17.s[0]\n"
       "add x15, x15, #0x60\n"
       "cbnz x12, 25b\n"
       "26:"  // Height 1: Multiply loop: No odd multiplies
@@ -426,21 +425,21 @@
       "prfm pstl1keep, [x14, #0x0]\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v16.4s\n"
+      "fmin v9.4s, v9.4s, v16.4s\n"
+      "fmin v10.4s, v10.4s, v16.4s\n"
+      "fmin v11.4s, v11.4s, v16.4s\n"
+      "fmin v12.4s, v12.4s, v16.4s\n"
+      "fmin v13.4s, v13.4s, v16.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
+      "fmax v12.4s, v12.4s, v16.4s\n"
+      "fmax v13.4s, v13.4s, v16.4s\n"
       "27:"  // Height 1: No activation
       "cmp x16, #0x18\n"
       "bge 40f\n"
@@ -701,26 +700,26 @@
       "60:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w12, [x20, x13, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 61f\n"
-      "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x11, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x11, [x20, #0x0]\n"
+      "ldr x10, [x20, #0x8]\n"
       "cbnz x13, 62f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x11, x11, x20, LSL #2\n"
-      "add x25, x25, x20, LSL #2\n"
+      "add x10, x10, x20, LSL #2\n"
       "b 62f\n"
       "61:"  // Height 2: setup direct input
       "mov x11, %x[input_ptr]\n"
-      "add x25, x11, x20, LSL #2\n"
+      "add x10, x11, x21, LSL #2\n"
       "62:"  // Height 2: input setup done
       "cmp x12, #0x4\n"
       "blt 65f\n"
       "ldr q0, [x11, #0x0]\n"
       "cmp x12, #0x8\n"
-      "ldr q1, [x25, #0x0]\n"
+      "ldr q1, [x10, #0x0]\n"
       "ldr q4, [x15, #0x0]\n"
       "ldr q5, [x15, #0x10]\n"
       "ldr q6, [x15, #0x20]\n"
@@ -728,239 +727,239 @@
       "blt 64f\n"
       "63:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr x10, [x15, #0x48]\n"
+      "ldr x23, [x15, #0x48]\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr d4, [x15, #0x40]\n"
+      "ldr d23, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr x9, [x15, #0x58]\n"
+      "ldr x22, [x15, #0x58]\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr d5, [x15, #0x50]\n"
+      "ldr d22, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr x28, [x15, #0x68]\n"
+      "ldr x21, [x15, #0x68]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "ldr d6, [x15, #0x60]\n"
+      "ldr d21, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr x27, [x15, #0x78]\n"
+      "ldr x20, [x15, #0x78]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "ldr d4, [x15, #0x80]\n"
-      "mov v6.d[1], x28\n"
-      "mov v7.d[1], x27\n"
-      "ldr x10, [x15, #0x88]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "ldr x9, [x15, #0x98]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "ldr d5, [x15, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr x28, [x15, #0xa8]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr x27, [x15, #0xb8]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "ldr d4, [x15, #0xc0]\n"
-      "mov v6.d[1], x28\n"
-      "mov v7.d[1], x27\n"
-      "ldr x10, [x15, #0xc8]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "ldr x9, [x15, #0xd8]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "ldr d5, [x15, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "ldr x28, [x15, #0xe8]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "ldr x27, [x15, #0xf8]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "ldr d4, [x15, #0x100]\n"
-      "mov v6.d[1], x28\n"
-      "mov v7.d[1], x27\n"
-      "ldr x10, [x15, #0x108]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "ldr x9, [x15, #0x118]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "ldr d5, [x15, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr x28, [x15, #0x128]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "ldr d6, [x15, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr x27, [x15, #0x138]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "ldr d7, [x15, #0x130]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "ldr d4, [x15, #0x140]\n"
-      "mov v6.d[1], x28\n"
-      "mov v7.d[1], x27\n"
-      "ldr x10, [x15, #0x148]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "ldr x9, [x15, #0x158]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "ldr d5, [x15, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr x28, [x15, #0x168]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr d6, [x15, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr x27, [x15, #0x178]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr d7, [x15, #0x170]\n"
-      "mov v4.d[1], x10\n"
+      "ldr d20, [x15, #0x70]\n"
+      "mov v23.d[1], x23\n"
+      "fmla v12.4s, v23.4s, v0.s[0]\n"
+      "mov v22.d[1], x22\n"
+      "fmla v18.4s, v23.4s, v1.s[0]\n"
+      "ldr d23, [x15, #0x80]\n"
+      "mov v21.d[1], x21\n"
+      "mov v20.d[1], x20\n"
+      "ldr x23, [x15, #0x88]\n"
+      "fmla v13.4s, v22.4s, v0.s[0]\n"
+      "ldr x22, [x15, #0x98]\n"
+      "fmla v19.4s, v22.4s, v1.s[0]\n"
+      "ldr d22, [x15, #0x90]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "ldr x21, [x15, #0xa8]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "ldr d21, [x15, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "ldr x20, [x15, #0xb8]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "ldr d20, [x15, #0xb0]\n"
+      "mov v23.d[1], x23\n"
+      "fmla v10.4s, v23.4s, v0.s[1]\n"
+      "mov v22.d[1], x22\n"
+      "fmla v16.4s, v23.4s, v1.s[1]\n"
+      "ldr d23, [x15, #0xc0]\n"
+      "mov v21.d[1], x21\n"
+      "mov v20.d[1], x20\n"
+      "ldr x23, [x15, #0xc8]\n"
+      "fmla v11.4s, v22.4s, v0.s[1]\n"
+      "ldr x22, [x15, #0xd8]\n"
+      "fmla v17.4s, v22.4s, v1.s[1]\n"
+      "ldr d22, [x15, #0xd0]\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "ldr x21, [x15, #0xe8]\n"
+      "fmla v18.4s, v21.4s, v1.s[1]\n"
+      "ldr d21, [x15, #0xe0]\n"
+      "fmla v13.4s, v20.4s, v0.s[1]\n"
+      "ldr x20, [x15, #0xf8]\n"
+      "fmla v19.4s, v20.4s, v1.s[1]\n"
+      "ldr d20, [x15, #0xf0]\n"
+      "mov v23.d[1], x23\n"
+      "fmla v8.4s, v23.4s, v0.s[2]\n"
+      "mov v22.d[1], x22\n"
+      "fmla v14.4s, v23.4s, v1.s[2]\n"
+      "ldr d23, [x15, #0x100]\n"
+      "mov v21.d[1], x21\n"
+      "mov v20.d[1], x20\n"
+      "ldr x23, [x15, #0x108]\n"
+      "fmla v9.4s, v22.4s, v0.s[2]\n"
+      "ldr x22, [x15, #0x118]\n"
+      "fmla v15.4s, v22.4s, v1.s[2]\n"
+      "ldr d22, [x15, #0x110]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "ldr x21, [x15, #0x128]\n"
+      "fmla v16.4s, v21.4s, v1.s[2]\n"
+      "ldr d21, [x15, #0x120]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "ldr x20, [x15, #0x138]\n"
+      "fmla v17.4s, v20.4s, v1.s[2]\n"
+      "ldr d20, [x15, #0x130]\n"
+      "mov v23.d[1], x23\n"
+      "fmla v12.4s, v23.4s, v0.s[2]\n"
+      "mov v22.d[1], x22\n"
+      "fmla v18.4s, v23.4s, v1.s[2]\n"
+      "ldr d23, [x15, #0x140]\n"
+      "mov v21.d[1], x21\n"
+      "mov v20.d[1], x20\n"
+      "ldr x23, [x15, #0x148]\n"
+      "fmla v13.4s, v22.4s, v0.s[2]\n"
+      "ldr x22, [x15, #0x158]\n"
+      "fmla v19.4s, v22.4s, v1.s[2]\n"
+      "ldr d22, [x15, #0x150]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "ldr x21, [x15, #0x168]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "ldr d21, [x15, #0x160]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "ldr x20, [x15, #0x178]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "ldr d20, [x15, #0x170]\n"
+      "mov v23.d[1], x23\n"
       "add x11, x11, #0x10\n"
-      "mov v5.d[1], x9\n"
-      "add x25, x25, #0x10\n"
-      "mov v6.d[1], x28\n"
+      "mov v22.d[1], x22\n"
+      "add x10, x10, #0x10\n"
+      "mov v21.d[1], x21\n"
       "add x15, x15, #0x180\n"
-      "mov v7.d[1], x27\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.4s, v23.4s, v0.s[3]\n"
+      "fmla v16.4s, v23.4s, v1.s[3]\n"
       "ldr d4, [x15, #0x0]\n"
-      "ldr x10, [x15, #0x8]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
+      "ldr x21, [x15, #0x8]\n"
+      "fmla v11.4s, v22.4s, v0.s[3]\n"
+      "fmla v17.4s, v22.4s, v1.s[3]\n"
       "ldr d5, [x15, #0x10]\n"
-      "ldr x9, [x15, #0x18]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
+      "ldr x20, [x15, #0x18]\n"
+      "fmla v12.4s, v21.4s, v0.s[3]\n"
+      "fmla v18.4s, v21.4s, v1.s[3]\n"
       "ldr d6, [x15, #0x20]\n"
-      "ldr x28, [x15, #0x28]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "ldr x23, [x15, #0x28]\n"
+      "fmla v13.4s, v20.4s, v0.s[3]\n"
       "ldr d0, [x11, #0x0]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x25, #0x0]\n"
+      "fmla v19.4s, v20.4s, v1.s[3]\n"
+      "ldr d1, [x10, #0x0]\n"
       "sub x12, x12, #0x4\n"
       "ldr d7, [x15, #0x30]\n"
       "cmp x12, #0x8\n"
-      "ldr x26, [x11, #0x8]\n"
-      "mov v4.d[1], x10\n"
-      "ldr x24, [x25, #0x8]\n"
-      "mov v5.d[1], x9\n"
-      "ldr x27, [x15, #0x38]\n"
-      "mov v6.d[1], x28\n"
+      "ldr x22, [x11, #0x8]\n"
+      "mov v4.d[1], x21\n"
+      "ldr x21, [x10, #0x8]\n"
+      "mov v5.d[1], x20\n"
+      "ldr x20, [x15, #0x38]\n"
+      "mov v6.d[1], x23\n"
       "prfm pldl1keep, [x11, #0x80]\n"
-      "mov v0.d[1], x26\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "mov v1.d[1], x24\n"
-      "mov v7.d[1], x27\n"
+      "mov v0.d[1], x22\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "mov v1.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "bge 63b\n"
       "64:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.4s, v4.4s, v0.s[0]\n"
       "add x11, x11, #0x10\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
+      "ldr q23, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
+      "ldr q22, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "sub x12, x12, #0x4\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x15, #0x60]\n"
+      "ldr q21, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
       "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x15, #0x70]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "ldr q4, [x15, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "ldr q5, [x15, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "ldr q4, [x15, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "ldr q5, [x15, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "ldr q4, [x15, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "ldr q5, [x15, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x15, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x15, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "ldr q4, [x15, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "ldr q5, [x15, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x15, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x15, #0x170]\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
+      "ldr q20, [x15, #0x70]\n"
+      "fmla v12.4s, v23.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v18.4s, v23.4s, v1.s[0]\n"
+      "ldr q23, [x15, #0x80]\n"
+      "fmla v13.4s, v22.4s, v0.s[0]\n"
+      "fmla v19.4s, v22.4s, v1.s[0]\n"
+      "ldr q22, [x15, #0x90]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "ldr q21, [x15, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "ldr q20, [x15, #0xb0]\n"
+      "fmla v10.4s, v23.4s, v0.s[1]\n"
+      "fmla v16.4s, v23.4s, v1.s[1]\n"
+      "ldr q23, [x15, #0xc0]\n"
+      "fmla v11.4s, v22.4s, v0.s[1]\n"
+      "fmla v17.4s, v22.4s, v1.s[1]\n"
+      "ldr q22, [x15, #0xd0]\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "fmla v18.4s, v21.4s, v1.s[1]\n"
+      "ldr q21, [x15, #0xe0]\n"
+      "fmla v13.4s, v20.4s, v0.s[1]\n"
+      "fmla v19.4s, v20.4s, v1.s[1]\n"
+      "ldr q20, [x15, #0xf0]\n"
+      "fmla v8.4s, v23.4s, v0.s[2]\n"
+      "fmla v14.4s, v23.4s, v1.s[2]\n"
+      "ldr q23, [x15, #0x100]\n"
+      "fmla v9.4s, v22.4s, v0.s[2]\n"
+      "fmla v15.4s, v22.4s, v1.s[2]\n"
+      "ldr q22, [x15, #0x110]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v16.4s, v21.4s, v1.s[2]\n"
+      "ldr q21, [x15, #0x120]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "fmla v17.4s, v20.4s, v1.s[2]\n"
+      "ldr q20, [x15, #0x130]\n"
+      "fmla v12.4s, v23.4s, v0.s[2]\n"
+      "fmla v18.4s, v23.4s, v1.s[2]\n"
+      "ldr q23, [x15, #0x140]\n"
+      "fmla v13.4s, v22.4s, v0.s[2]\n"
+      "fmla v19.4s, v22.4s, v1.s[2]\n"
+      "ldr q22, [x15, #0x150]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "ldr q21, [x15, #0x160]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "ldr q20, [x15, #0x170]\n"
+      "fmla v10.4s, v23.4s, v0.s[3]\n"
       "add x15, x15, #0x180\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
+      "fmla v16.4s, v23.4s, v1.s[3]\n"
+      "fmla v11.4s, v22.4s, v0.s[3]\n"
+      "fmla v17.4s, v22.4s, v1.s[3]\n"
+      "fmla v12.4s, v21.4s, v0.s[3]\n"
+      "fmla v18.4s, v21.4s, v1.s[3]\n"
+      "fmla v13.4s, v20.4s, v0.s[3]\n"
+      "fmla v19.4s, v20.4s, v1.s[3]\n"
       "65:"  // Height 2: Multiply loop: Main loop skip
       "cbz x12, 67f\n"
       "66:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
+      "ldr s25, [x11], #0x4\n"
       "sub x12, x12, #0x1\n"
-      "ldr s1, [x25], #0x4\n"
-      "ldr q4, [x15, #0x0]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x10]\n"
-      "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
-      "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "ldr s24, [x10], #0x4\n"
+      "ldr q21, [x15, #0x0]\n"
+      "fmla v8.4s, v21.4s, v25.s[0]\n"
+      "ldr q20, [x15, #0x10]\n"
+      "fmla v14.4s, v21.4s, v24.s[0]\n"
+      "ldr q23, [x15, #0x20]\n"
+      "fmla v9.4s, v20.4s, v25.s[0]\n"
+      "ldr q22, [x15, #0x30]\n"
+      "fmla v15.4s, v20.4s, v24.s[0]\n"
+      "ldr q21, [x15, #0x40]\n"
+      "fmla v10.4s, v23.4s, v25.s[0]\n"
+      "ldr q20, [x15, #0x50]\n"
+      "fmla v16.4s, v23.4s, v24.s[0]\n"
+      "fmla v11.4s, v22.4s, v25.s[0]\n"
       "add x15, x15, #0x60\n"
-      "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
+      "fmla v17.4s, v22.4s, v24.s[0]\n"
+      "fmla v12.4s, v21.4s, v25.s[0]\n"
+      "fmla v18.4s, v21.4s, v24.s[0]\n"
+      "fmla v13.4s, v20.4s, v25.s[0]\n"
+      "fmla v19.4s, v20.4s, v24.s[0]\n"
       "cbnz x12, 66b\n"
       "67:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -973,33 +972,33 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 68f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
-      "fmin v14.4s, v14.4s, v0.4s\n"
-      "fmin v15.4s, v15.4s, v0.4s\n"
-      "fmin v16.4s, v16.4s, v0.4s\n"
-      "fmin v17.4s, v17.4s, v0.4s\n"
-      "fmin v18.4s, v18.4s, v0.4s\n"
-      "fmin v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v20.4s\n"
+      "fmin v9.4s, v9.4s, v20.4s\n"
+      "fmin v10.4s, v10.4s, v20.4s\n"
+      "fmin v11.4s, v11.4s, v20.4s\n"
+      "fmin v12.4s, v12.4s, v20.4s\n"
+      "fmin v13.4s, v13.4s, v20.4s\n"
+      "fmin v14.4s, v14.4s, v20.4s\n"
+      "fmin v15.4s, v15.4s, v20.4s\n"
+      "fmin v16.4s, v16.4s, v20.4s\n"
+      "fmin v17.4s, v17.4s, v20.4s\n"
+      "fmin v18.4s, v18.4s, v20.4s\n"
+      "fmin v19.4s, v19.4s, v20.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v20.4s\n"
+      "fmax v9.4s, v9.4s, v20.4s\n"
+      "fmax v10.4s, v10.4s, v20.4s\n"
+      "fmax v11.4s, v11.4s, v20.4s\n"
+      "fmax v12.4s, v12.4s, v20.4s\n"
+      "fmax v13.4s, v13.4s, v20.4s\n"
+      "fmax v14.4s, v14.4s, v20.4s\n"
+      "fmax v15.4s, v15.4s, v20.4s\n"
+      "fmax v16.4s, v16.4s, v20.4s\n"
+      "fmax v17.4s, v17.4s, v20.4s\n"
+      "fmax v18.4s, v18.4s, v20.4s\n"
+      "fmax v19.4s, v19.4s, v20.4s\n"
       "68:"  // Height 2: No activation
       "cmp x16, #0x18\n"
       "bge 81f\n"
@@ -1339,30 +1338,30 @@
       "101:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w12, [x20, x13, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 102f\n"
-      "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x11, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x23, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x11, [x20, #0x0]\n"
+      "ldr x10, [x20, #0x8]\n"
+      "ldr x9, [x20, #0x10]\n"
       "cbnz x13, 103f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x11, x11, x20, LSL #2\n"
-      "add x25, x25, x20, LSL #2\n"
-      "add x23, x23, x20, LSL #2\n"
+      "add x10, x10, x20, LSL #2\n"
+      "add x9, x9, x20, LSL #2\n"
       "b 103f\n"
       "102:"  // Height 3: setup direct input
       "mov x11, %x[input_ptr]\n"
-      "add x25, x11, x20, LSL #2\n"
-      "add x23, x25, x20, LSL #2\n"
+      "add x10, x11, x21, LSL #2\n"
+      "add x9, x10, x21, LSL #2\n"
       "103:"  // Height 3: input setup done
       "cmp x12, #0x4\n"
       "blt 106f\n"
       "ldr q0, [x11, #0x0]\n"
       "cmp x12, #0x8\n"
-      "ldr q1, [x25, #0x0]\n"
-      "ldr q2, [x23, #0x0]\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q2, [x9, #0x0]\n"
       "ldr q4, [x15, #0x0]\n"
       "ldr q5, [x15, #0x10]\n"
       "ldr q6, [x15, #0x20]\n"
@@ -1370,301 +1369,301 @@
       "blt 105f\n"
       "104:"  // Height 3: Multiply loop: Main loop head
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr x10, [x15, #0x48]\n"
+      "ldr x23, [x15, #0x48]\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr x9, [x15, #0x58]\n"
+      "ldr x22, [x15, #0x58]\n"
       "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr d4, [x15, #0x40]\n"
+      "ldr d29, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr x28, [x15, #0x68]\n"
+      "ldr x21, [x15, #0x68]\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr x27, [x15, #0x78]\n"
+      "ldr x20, [x15, #0x78]\n"
       "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr d5, [x15, #0x50]\n"
+      "ldr d28, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "mov v4.d[1], x10\n"
+      "mov v29.d[1], x23\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "mov v5.d[1], x9\n"
+      "mov v28.d[1], x22\n"
       "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "ldr d6, [x15, #0x60]\n"
+      "ldr d27, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x28\n"
+      "mov v27.d[1], x21\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "ldr x10, [x15, #0x88]\n"
+      "ldr x23, [x15, #0x88]\n"
       "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "ldr x9, [x15, #0x98]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "ldr d4, [x15, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "ldr x28, [x15, #0xa8]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "ldr x27, [x15, #0xb8]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "ldr d5, [x15, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v20.4s, v6.4s, v2.s[1]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x28\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr x10, [x15, #0xc8]\n"
-      "fmla v21.4s, v7.4s, v2.s[1]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "ldr x9, [x15, #0xd8]\n"
-      "fmla v22.4s, v4.4s, v2.s[1]\n"
-      "ldr d4, [x15, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "ldr x28, [x15, #0xe8]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "ldr x27, [x15, #0xf8]\n"
-      "fmla v23.4s, v5.4s, v2.s[1]\n"
-      "ldr d5, [x15, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v24.4s, v6.4s, v2.s[1]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x28\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "ldr x10, [x15, #0x108]\n"
-      "fmla v25.4s, v7.4s, v2.s[1]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "ldr x9, [x15, #0x118]\n"
-      "fmla v20.4s, v4.4s, v2.s[2]\n"
-      "ldr d4, [x15, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "ldr x28, [x15, #0x128]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "ldr x27, [x15, #0x138]\n"
-      "fmla v21.4s, v5.4s, v2.s[2]\n"
-      "ldr d5, [x15, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v22.4s, v6.4s, v2.s[2]\n"
-      "ldr d6, [x15, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x28\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "ldr x10, [x15, #0x148]\n"
-      "fmla v23.4s, v7.4s, v2.s[2]\n"
-      "ldr d7, [x15, #0x130]\n"
-      "mov v7.d[1], x27\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "ldr x9, [x15, #0x158]\n"
-      "fmla v24.4s, v4.4s, v2.s[2]\n"
-      "ldr d4, [x15, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "ldr x28, [x15, #0x168]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "ldr x27, [x15, #0x178]\n"
-      "fmla v25.4s, v5.4s, v2.s[2]\n"
-      "ldr d5, [x15, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "mov v4.d[1], x10\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "mov v5.d[1], x9\n"
-      "fmla v20.4s, v6.4s, v2.s[3]\n"
-      "ldr d6, [x15, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "mov v6.d[1], x28\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "ldr d26, [x15, #0x70]\n"
+      "mov v26.d[1], x20\n"
+      "fmla v12.4s, v29.4s, v0.s[0]\n"
+      "fmla v18.4s, v29.4s, v1.s[0]\n"
+      "ldr x22, [x15, #0x98]\n"
+      "fmla v24.4s, v29.4s, v2.s[0]\n"
+      "ldr d29, [x15, #0x80]\n"
+      "fmla v13.4s, v28.4s, v0.s[0]\n"
+      "ldr x21, [x15, #0xa8]\n"
+      "fmla v19.4s, v28.4s, v1.s[0]\n"
+      "ldr x20, [x15, #0xb8]\n"
+      "fmla v25.4s, v28.4s, v2.s[0]\n"
+      "ldr d28, [x15, #0x90]\n"
+      "fmla v8.4s, v27.4s, v0.s[1]\n"
+      "mov v29.d[1], x23\n"
+      "fmla v14.4s, v27.4s, v1.s[1]\n"
+      "mov v28.d[1], x22\n"
+      "fmla v20.4s, v27.4s, v2.s[1]\n"
+      "ldr d27, [x15, #0xa0]\n"
+      "fmla v9.4s, v26.4s, v0.s[1]\n"
+      "mov v27.d[1], x21\n"
+      "fmla v15.4s, v26.4s, v1.s[1]\n"
+      "ldr x23, [x15, #0xc8]\n"
+      "fmla v21.4s, v26.4s, v2.s[1]\n"
+      "ldr d26, [x15, #0xb0]\n"
+      "mov v26.d[1], x20\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v16.4s, v29.4s, v1.s[1]\n"
+      "ldr x22, [x15, #0xd8]\n"
+      "fmla v22.4s, v29.4s, v2.s[1]\n"
+      "ldr d29, [x15, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "ldr x21, [x15, #0xe8]\n"
+      "fmla v17.4s, v28.4s, v1.s[1]\n"
+      "ldr x20, [x15, #0xf8]\n"
+      "fmla v23.4s, v28.4s, v2.s[1]\n"
+      "ldr d28, [x15, #0xd0]\n"
+      "fmla v12.4s, v27.4s, v0.s[1]\n"
+      "mov v29.d[1], x23\n"
+      "fmla v18.4s, v27.4s, v1.s[1]\n"
+      "mov v28.d[1], x22\n"
+      "fmla v24.4s, v27.4s, v2.s[1]\n"
+      "ldr d27, [x15, #0xe0]\n"
+      "fmla v13.4s, v26.4s, v0.s[1]\n"
+      "mov v27.d[1], x21\n"
+      "fmla v19.4s, v26.4s, v1.s[1]\n"
+      "ldr x23, [x15, #0x108]\n"
+      "fmla v25.4s, v26.4s, v2.s[1]\n"
+      "ldr d26, [x15, #0xf0]\n"
+      "mov v26.d[1], x20\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "ldr x22, [x15, #0x118]\n"
+      "fmla v20.4s, v29.4s, v2.s[2]\n"
+      "ldr d29, [x15, #0x100]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "ldr x21, [x15, #0x128]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "ldr x20, [x15, #0x138]\n"
+      "fmla v21.4s, v28.4s, v2.s[2]\n"
+      "ldr d28, [x15, #0x110]\n"
+      "fmla v10.4s, v27.4s, v0.s[2]\n"
+      "mov v29.d[1], x23\n"
+      "fmla v16.4s, v27.4s, v1.s[2]\n"
+      "mov v28.d[1], x22\n"
+      "fmla v22.4s, v27.4s, v2.s[2]\n"
+      "ldr d27, [x15, #0x120]\n"
+      "fmla v11.4s, v26.4s, v0.s[2]\n"
+      "mov v27.d[1], x21\n"
+      "fmla v17.4s, v26.4s, v1.s[2]\n"
+      "ldr x23, [x15, #0x148]\n"
+      "fmla v23.4s, v26.4s, v2.s[2]\n"
+      "ldr d26, [x15, #0x130]\n"
+      "mov v26.d[1], x20\n"
+      "fmla v12.4s, v29.4s, v0.s[2]\n"
+      "fmla v18.4s, v29.4s, v1.s[2]\n"
+      "ldr x22, [x15, #0x158]\n"
+      "fmla v24.4s, v29.4s, v2.s[2]\n"
+      "ldr d29, [x15, #0x140]\n"
+      "fmla v13.4s, v28.4s, v0.s[2]\n"
+      "ldr x21, [x15, #0x168]\n"
+      "fmla v19.4s, v28.4s, v1.s[2]\n"
+      "ldr x20, [x15, #0x178]\n"
+      "fmla v25.4s, v28.4s, v2.s[2]\n"
+      "ldr d28, [x15, #0x150]\n"
+      "fmla v8.4s, v27.4s, v0.s[3]\n"
+      "mov v29.d[1], x23\n"
+      "fmla v14.4s, v27.4s, v1.s[3]\n"
+      "mov v28.d[1], x22\n"
+      "fmla v20.4s, v27.4s, v2.s[3]\n"
+      "ldr d27, [x15, #0x160]\n"
+      "fmla v9.4s, v26.4s, v0.s[3]\n"
+      "mov v27.d[1], x21\n"
+      "fmla v15.4s, v26.4s, v1.s[3]\n"
       "add x11, x11, #0x10\n"
-      "fmla v21.4s, v7.4s, v2.s[3]\n"
-      "ldr d7, [x15, #0x170]\n"
-      "mov v7.d[1], x27\n"
-      "add x25, x25, #0x10\n"
-      "add x23, x23, #0x10\n"
+      "fmla v21.4s, v26.4s, v2.s[3]\n"
+      "ldr d26, [x15, #0x170]\n"
+      "mov v26.d[1], x20\n"
+      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       "add x15, x15, #0x180\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "ldr x10, [x15, #0x8]\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "ldr x9, [x15, #0x18]\n"
-      "fmla v22.4s, v4.4s, v2.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "ldr x26, [x15, #0x8]\n"
+      "fmla v16.4s, v29.4s, v1.s[3]\n"
+      "ldr x25, [x15, #0x18]\n"
+      "fmla v22.4s, v29.4s, v2.s[3]\n"
       "ldr d4, [x15, #0x0]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "ldr x28, [x15, #0x28]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "ldr x26, [x11, #0x8]\n"
-      "fmla v23.4s, v5.4s, v2.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "ldr x24, [x15, #0x28]\n"
+      "fmla v17.4s, v28.4s, v1.s[3]\n"
+      "ldr x23, [x11, #0x8]\n"
+      "fmla v23.4s, v28.4s, v2.s[3]\n"
       "ldr d5, [x15, #0x10]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "ldr x24, [x25, #0x8]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
-      "ldr x22, [x23, #0x8]\n"
-      "fmla v24.4s, v6.4s, v2.s[3]\n"
+      "fmla v12.4s, v27.4s, v0.s[3]\n"
+      "ldr x22, [x10, #0x8]\n"
+      "fmla v18.4s, v27.4s, v1.s[3]\n"
+      "ldr x21, [x9, #0x8]\n"
+      "fmla v24.4s, v27.4s, v2.s[3]\n"
       "ldr d6, [x15, #0x20]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v26.4s, v0.s[3]\n"
       "ldr d0, [x11, #0x0]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x25, #0x0]\n"
-      "fmla v25.4s, v7.4s, v2.s[3]\n"
-      "ldr d2, [x23, #0x0]\n"
+      "fmla v19.4s, v26.4s, v1.s[3]\n"
+      "ldr d1, [x10, #0x0]\n"
+      "fmla v25.4s, v26.4s, v2.s[3]\n"
+      "ldr d2, [x9, #0x0]\n"
       "ldr d7, [x15, #0x30]\n"
       "sub x12, x12, #0x4\n"
-      "ldr x27, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       "cmp x12, #0x8\n"
       "prfm pldl1keep, [x11, #0x80]\n"
-      "mov v4.d[1], x10\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "mov v5.d[1], x9\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "mov v6.d[1], x28\n"
-      "mov v0.d[1], x26\n"
-      "mov v1.d[1], x24\n"
-      "mov v2.d[1], x22\n"
-      "mov v7.d[1], x27\n"
+      "mov v4.d[1], x26\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "mov v5.d[1], x25\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "mov v6.d[1], x24\n"
+      "mov v0.d[1], x23\n"
+      "mov v1.d[1], x22\n"
+      "mov v2.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "bge 104b\n"
       "105:"  // Height 3: Multiply loop: Single iteration only
       "fmla v8.4s, v4.4s, v0.s[0]\n"
       "add x11, x11, #0x10\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
+      "ldr q29, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
       "sub x12, x12, #0x4\n"
       "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
+      "ldr q28, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x15, #0x60]\n"
+      "ldr q27, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
       "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x15, #0x70]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "ldr q4, [x15, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x15, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v20.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v21.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "fmla v22.4s, v4.4s, v2.s[1]\n"
-      "ldr q4, [x15, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "fmla v23.4s, v5.4s, v2.s[1]\n"
-      "ldr q5, [x15, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "fmla v24.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "fmla v25.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "fmla v20.4s, v4.4s, v2.s[2]\n"
-      "ldr q4, [x15, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "fmla v21.4s, v5.4s, v2.s[2]\n"
-      "ldr q5, [x15, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "fmla v22.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x15, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "fmla v23.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x15, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "fmla v24.4s, v4.4s, v2.s[2]\n"
-      "ldr q4, [x15, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "fmla v25.4s, v5.4s, v2.s[2]\n"
-      "ldr q5, [x15, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v20.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x15, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v21.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x15, #0x170]\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
+      "ldr q26, [x15, #0x70]\n"
+      "fmla v12.4s, v29.4s, v0.s[0]\n"
+      "fmla v18.4s, v29.4s, v1.s[0]\n"
+      "fmla v24.4s, v29.4s, v2.s[0]\n"
+      "ldr q29, [x15, #0x80]\n"
+      "fmla v13.4s, v28.4s, v0.s[0]\n"
+      "fmla v19.4s, v28.4s, v1.s[0]\n"
+      "fmla v25.4s, v28.4s, v2.s[0]\n"
+      "ldr q28, [x15, #0x90]\n"
+      "fmla v8.4s, v27.4s, v0.s[1]\n"
+      "fmla v14.4s, v27.4s, v1.s[1]\n"
+      "fmla v20.4s, v27.4s, v2.s[1]\n"
+      "ldr q27, [x15, #0xa0]\n"
+      "fmla v9.4s, v26.4s, v0.s[1]\n"
+      "fmla v15.4s, v26.4s, v1.s[1]\n"
+      "fmla v21.4s, v26.4s, v2.s[1]\n"
+      "ldr q26, [x15, #0xb0]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v16.4s, v29.4s, v1.s[1]\n"
+      "fmla v22.4s, v29.4s, v2.s[1]\n"
+      "ldr q29, [x15, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v17.4s, v28.4s, v1.s[1]\n"
+      "fmla v23.4s, v28.4s, v2.s[1]\n"
+      "ldr q28, [x15, #0xd0]\n"
+      "fmla v12.4s, v27.4s, v0.s[1]\n"
+      "fmla v18.4s, v27.4s, v1.s[1]\n"
+      "fmla v24.4s, v27.4s, v2.s[1]\n"
+      "ldr q27, [x15, #0xe0]\n"
+      "fmla v13.4s, v26.4s, v0.s[1]\n"
+      "fmla v19.4s, v26.4s, v1.s[1]\n"
+      "fmla v25.4s, v26.4s, v2.s[1]\n"
+      "ldr q26, [x15, #0xf0]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v20.4s, v29.4s, v2.s[2]\n"
+      "ldr q29, [x15, #0x100]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v21.4s, v28.4s, v2.s[2]\n"
+      "ldr q28, [x15, #0x110]\n"
+      "fmla v10.4s, v27.4s, v0.s[2]\n"
+      "fmla v16.4s, v27.4s, v1.s[2]\n"
+      "fmla v22.4s, v27.4s, v2.s[2]\n"
+      "ldr q27, [x15, #0x120]\n"
+      "fmla v11.4s, v26.4s, v0.s[2]\n"
+      "fmla v17.4s, v26.4s, v1.s[2]\n"
+      "fmla v23.4s, v26.4s, v2.s[2]\n"
+      "ldr q26, [x15, #0x130]\n"
+      "fmla v12.4s, v29.4s, v0.s[2]\n"
+      "fmla v18.4s, v29.4s, v1.s[2]\n"
+      "fmla v24.4s, v29.4s, v2.s[2]\n"
+      "ldr q29, [x15, #0x140]\n"
+      "fmla v13.4s, v28.4s, v0.s[2]\n"
+      "fmla v19.4s, v28.4s, v1.s[2]\n"
+      "fmla v25.4s, v28.4s, v2.s[2]\n"
+      "ldr q28, [x15, #0x150]\n"
+      "fmla v8.4s, v27.4s, v0.s[3]\n"
+      "fmla v14.4s, v27.4s, v1.s[3]\n"
+      "fmla v20.4s, v27.4s, v2.s[3]\n"
+      "ldr q27, [x15, #0x160]\n"
+      "fmla v9.4s, v26.4s, v0.s[3]\n"
+      "fmla v15.4s, v26.4s, v1.s[3]\n"
+      "fmla v21.4s, v26.4s, v2.s[3]\n"
+      "ldr q26, [x15, #0x170]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
       "add x15, x15, #0x180\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "fmla v22.4s, v4.4s, v2.s[3]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "fmla v23.4s, v5.4s, v2.s[3]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
-      "fmla v24.4s, v6.4s, v2.s[3]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
-      "fmla v25.4s, v7.4s, v2.s[3]\n"
+      "fmla v16.4s, v29.4s, v1.s[3]\n"
+      "fmla v22.4s, v29.4s, v2.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "fmla v17.4s, v28.4s, v1.s[3]\n"
+      "fmla v23.4s, v28.4s, v2.s[3]\n"
+      "fmla v12.4s, v27.4s, v0.s[3]\n"
+      "fmla v18.4s, v27.4s, v1.s[3]\n"
+      "fmla v24.4s, v27.4s, v2.s[3]\n"
+      "fmla v13.4s, v26.4s, v0.s[3]\n"
+      "fmla v19.4s, v26.4s, v1.s[3]\n"
+      "fmla v25.4s, v26.4s, v2.s[3]\n"
       "106:"  // Height 3: Multiply loop: Main loop skip
       "cbz x12, 108f\n"
       "107:"  // Height 3: Multiply loop: Odd block loop
       "ldr s0, [x11], #0x4\n"
       "sub x12, x12, #0x1\n"
-      "ldr s1, [x25], #0x4\n"
-      "ldr s2, [x23], #0x4\n"
-      "ldr q4, [x15, #0x0]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x10]\n"
-      "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
-      "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s31, [x10], #0x4\n"
+      "ldr s30, [x9], #0x4\n"
+      "ldr q27, [x15, #0x0]\n"
+      "fmla v8.4s, v27.4s, v0.s[0]\n"
+      "ldr q26, [x15, #0x10]\n"
+      "fmla v14.4s, v27.4s, v31.s[0]\n"
+      "ldr q29, [x15, #0x20]\n"
+      "fmla v20.4s, v27.4s, v30.s[0]\n"
+      "ldr q28, [x15, #0x30]\n"
+      "fmla v9.4s, v26.4s, v0.s[0]\n"
+      "ldr q27, [x15, #0x40]\n"
+      "fmla v15.4s, v26.4s, v31.s[0]\n"
+      "fmla v21.4s, v26.4s, v30.s[0]\n"
+      "ldr q26, [x15, #0x50]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
       "add x15, x15, #0x60\n"
-      "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
+      "fmla v16.4s, v29.4s, v31.s[0]\n"
+      "fmla v22.4s, v29.4s, v30.s[0]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v17.4s, v28.4s, v31.s[0]\n"
+      "fmla v23.4s, v28.4s, v30.s[0]\n"
+      "fmla v12.4s, v27.4s, v0.s[0]\n"
+      "fmla v18.4s, v27.4s, v31.s[0]\n"
+      "fmla v24.4s, v27.4s, v30.s[0]\n"
+      "fmla v13.4s, v26.4s, v0.s[0]\n"
+      "fmla v19.4s, v26.4s, v31.s[0]\n"
+      "fmla v25.4s, v26.4s, v30.s[0]\n"
       "cbnz x12, 107b\n"
       "108:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1679,45 +1678,45 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 109f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
-      "fmin v14.4s, v14.4s, v0.4s\n"
-      "fmin v15.4s, v15.4s, v0.4s\n"
-      "fmin v16.4s, v16.4s, v0.4s\n"
-      "fmin v17.4s, v17.4s, v0.4s\n"
-      "fmin v18.4s, v18.4s, v0.4s\n"
-      "fmin v19.4s, v19.4s, v0.4s\n"
-      "fmin v20.4s, v20.4s, v0.4s\n"
-      "fmin v21.4s, v21.4s, v0.4s\n"
-      "fmin v22.4s, v22.4s, v0.4s\n"
-      "fmin v23.4s, v23.4s, v0.4s\n"
-      "fmin v24.4s, v24.4s, v0.4s\n"
-      "fmin v25.4s, v25.4s, v0.4s\n"
+      "ld1r { v26.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v15.4s, v15.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmin v20.4s, v20.4s, v26.4s\n"
+      "fmin v21.4s, v21.4s, v26.4s\n"
+      "fmin v22.4s, v22.4s, v26.4s\n"
+      "fmin v23.4s, v23.4s, v26.4s\n"
+      "fmin v24.4s, v24.4s, v26.4s\n"
+      "fmin v25.4s, v25.4s, v26.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
-      "fmax v24.4s, v24.4s, v0.4s\n"
-      "fmax v25.4s, v25.4s, v0.4s\n"
+      "ld1r { v26.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v26.4s\n"
+      "fmax v9.4s, v9.4s, v26.4s\n"
+      "fmax v10.4s, v10.4s, v26.4s\n"
+      "fmax v11.4s, v11.4s, v26.4s\n"
+      "fmax v12.4s, v12.4s, v26.4s\n"
+      "fmax v13.4s, v13.4s, v26.4s\n"
+      "fmax v14.4s, v14.4s, v26.4s\n"
+      "fmax v15.4s, v15.4s, v26.4s\n"
+      "fmax v16.4s, v16.4s, v26.4s\n"
+      "fmax v17.4s, v17.4s, v26.4s\n"
+      "fmax v18.4s, v18.4s, v26.4s\n"
+      "fmax v19.4s, v19.4s, v26.4s\n"
+      "fmax v20.4s, v20.4s, v26.4s\n"
+      "fmax v21.4s, v21.4s, v26.4s\n"
+      "fmax v22.4s, v22.4s, v26.4s\n"
+      "fmax v23.4s, v23.4s, v26.4s\n"
+      "fmax v24.4s, v24.4s, v26.4s\n"
+      "fmax v25.4s, v25.4s, v26.4s\n"
       "109:"  // Height 3: No activation
       "cmp x16, #0x18\n"
       "bge 122f\n"
@@ -2139,34 +2138,34 @@
       "142:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w12, [x20, x13, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 143f\n"
-      "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x11, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x23, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x11, [x20, #0x0]\n"
+      "ldr x10, [x20, #0x8]\n"
+      "ldr x9, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
       "cbnz x13, 144f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x11, x11, x20, LSL #2\n"
-      "add x25, x25, x20, LSL #2\n"
-      "add x23, x23, x20, LSL #2\n"
-      "add x21, x21, x20, LSL #2\n"
+      "add x10, x10, x20, LSL #2\n"
+      "add x9, x9, x20, LSL #2\n"
+      "add x28, x28, x20, LSL #2\n"
       "b 144f\n"
       "143:"  // Height 4: setup direct input
       "mov x11, %x[input_ptr]\n"
-      "add x25, x11, x20, LSL #2\n"
-      "add x23, x25, x20, LSL #2\n"
-      "add x21, x23, x20, LSL #2\n"
+      "add x10, x11, x21, LSL #2\n"
+      "add x9, x10, x21, LSL #2\n"
+      "add x28, x9, x21, LSL #2\n"
       "144:"  // Height 4: input setup done
       "cmp x12, #0x4\n"
       "blt 147f\n"
       "ldr q0, [x11, #0x0]\n"
       "cmp x12, #0x8\n"
-      "ldr q1, [x25, #0x0]\n"
-      "ldr q2, [x23, #0x0]\n"
-      "ldr q3, [x21, #0x0]\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q2, [x9, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
       "ldr q4, [x15, #0x0]\n"
       "ldr q5, [x15, #0x10]\n"
       "ldr q6, [x15, #0x20]\n"
@@ -2174,177 +2173,177 @@
       "blt 146f\n"
       "145:"  // Height 4: Multiply loop: Main loop head
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr x10, [x15, #0x48]\n"
+      "ldr x23, [x15, #0x48]\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr x9, [x15, #0x58]\n"
+      "ldr x22, [x15, #0x58]\n"
       "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr x28, [x15, #0x68]\n"
+      "ldr x21, [x15, #0x68]\n"
       "fmla v26.4s, v4.4s, v3.s[0]\n"
       "ldr d4, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr x27, [x15, #0x78]\n"
+      "ldr x20, [x15, #0x78]\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "mov v4.d[1], x10\n"
+      "mov v4.d[1], x23\n"
       "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr x10, [x15, #0x88]\n"
+      "ldr x23, [x15, #0x88]\n"
       "fmla v27.4s, v5.4s, v3.s[0]\n"
       "ldr d5, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "mov v5.d[1], x9\n"
+      "mov v5.d[1], x22\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "ldr x9, [x15, #0x98]\n"
+      "ldr x22, [x15, #0x98]\n"
       "fmla v22.4s, v6.4s, v2.s[0]\n"
       "add x11, x11, #0x10\n"
       "fmla v28.4s, v6.4s, v3.s[0]\n"
       "ldr d6, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x28\n"
+      "mov v6.d[1], x21\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "ldr x28, [x15, #0xa8]\n"
+      "ldr x21, [x15, #0xa8]\n"
       "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v29.4s, v7.4s, v3.s[0]\n"
       "ldr d7, [x15, #0x70]\n"
-      "mov v7.d[1], x27\n"
+      "mov v7.d[1], x20\n"
       "fmla v12.4s, v4.4s, v0.s[0]\n"
       "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "ldr x27, [x15, #0xb8]\n"
+      "ldr x20, [x15, #0xb8]\n"
       "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v30.4s, v4.4s, v3.s[0]\n"
       "ldr d4, [x15, #0x80]\n"
       "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "mov v4.d[1], x10\n"
+      "mov v4.d[1], x23\n"
       "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "ldr x10, [x15, #0xc8]\n"
+      "ldr x23, [x15, #0xc8]\n"
       "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       "fmla v31.4s, v5.4s, v3.s[0]\n"
       "ldr d5, [x15, #0x90]\n"
       "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "mov v5.d[1], x9\n"
+      "mov v5.d[1], x22\n"
       "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr x9, [x15, #0xd8]\n"
+      "ldr x22, [x15, #0xd8]\n"
       "fmla v20.4s, v6.4s, v2.s[1]\n"
-      "ldr x26, [x11, #0x8]\n"
+      "ldr x27, [x11, #0x8]\n"
       "fmla v26.4s, v6.4s, v3.s[1]\n"
       "ldr d6, [x15, #0xa0]\n"
       "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x28\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr x28, [x15, #0xe8]\n"
+      "ldr x21, [x15, #0xe8]\n"
       "fmla v21.4s, v7.4s, v2.s[1]\n"
-      "ldr x24, [x25, #0x8]\n"
+      "ldr x26, [x10, #0x8]\n"
       "fmla v27.4s, v7.4s, v3.s[1]\n"
       "ldr d7, [x15, #0xb0]\n"
-      "mov v7.d[1], x27\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.4s, v4.4s, v0.s[1]\n"
       "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "ldr x27, [x15, #0xf8]\n"
+      "ldr x20, [x15, #0xf8]\n"
       "fmla v22.4s, v4.4s, v2.s[1]\n"
-      "ldr x22, [x23, #0x8]\n"
+      "ldr x25, [x9, #0x8]\n"
       "fmla v28.4s, v4.4s, v3.s[1]\n"
       "ldr d4, [x15, #0xc0]\n"
       "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "mov v4.d[1], x10\n"
+      "mov v4.d[1], x23\n"
       "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "ldr x10, [x15, #0x108]\n"
+      "ldr x23, [x15, #0x108]\n"
       "fmla v23.4s, v5.4s, v2.s[1]\n"
-      "ldr x20, [x21, #0x8]\n"
+      "ldr x24, [x28, #0x8]\n"
       "fmla v29.4s, v5.4s, v3.s[1]\n"
       "ldr d5, [x15, #0xd0]\n"
       "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "mov v5.d[1], x9\n"
+      "mov v5.d[1], x22\n"
       "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "ldr x9, [x15, #0x118]\n"
+      "ldr x22, [x15, #0x118]\n"
       "fmla v24.4s, v6.4s, v2.s[1]\n"
       "sub x12, x12, #0x4\n"
       "fmla v30.4s, v6.4s, v3.s[1]\n"
       "ldr d6, [x15, #0xe0]\n"
       "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x28\n"
+      "mov v6.d[1], x21\n"
       "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "ldr x28, [x15, #0x128]\n"
+      "ldr x21, [x15, #0x128]\n"
       "fmla v25.4s, v7.4s, v2.s[1]\n"
       "cmp x12, #0x8\n"
       "fmla v31.4s, v7.4s, v3.s[1]\n"
       "ldr d7, [x15, #0xf0]\n"
-      "mov v7.d[1], x27\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.4s, v4.4s, v0.s[2]\n"
       "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "ldr x27, [x15, #0x138]\n"
+      "ldr x20, [x15, #0x138]\n"
       "fmla v20.4s, v4.4s, v2.s[2]\n"
       "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v26.4s, v4.4s, v3.s[2]\n"
       "ldr d4, [x15, #0x100]\n"
       "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "mov v4.d[1], x10\n"
+      "mov v4.d[1], x23\n"
       "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "ldr x10, [x15, #0x148]\n"
+      "ldr x23, [x15, #0x148]\n"
       "fmla v21.4s, v5.4s, v2.s[2]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v27.4s, v5.4s, v3.s[2]\n"
       "ldr d5, [x15, #0x110]\n"
       "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "mov v5.d[1], x9\n"
+      "mov v5.d[1], x22\n"
       "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "ldr x9, [x15, #0x158]\n"
+      "ldr x22, [x15, #0x158]\n"
       "fmla v22.4s, v6.4s, v2.s[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v28.4s, v6.4s, v3.s[2]\n"
       "ldr d6, [x15, #0x120]\n"
       "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x28\n"
+      "mov v6.d[1], x21\n"
       "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "ldr x28, [x15, #0x168]\n"
+      "ldr x21, [x15, #0x168]\n"
       "fmla v23.4s, v7.4s, v2.s[2]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "fmla v29.4s, v7.4s, v3.s[2]\n"
       "ldr d7, [x15, #0x130]\n"
-      "mov v7.d[1], x27\n"
+      "mov v7.d[1], x20\n"
       "fmla v12.4s, v4.4s, v0.s[2]\n"
       "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "ldr x27, [x15, #0x178]\n"
+      "ldr x20, [x15, #0x178]\n"
       "fmla v24.4s, v4.4s, v2.s[2]\n"
       "fmla v30.4s, v4.4s, v3.s[2]\n"
       "ldr d4, [x15, #0x140]\n"
       "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "mov v4.d[1], x10\n"
+      "mov v4.d[1], x23\n"
       "fmla v19.4s, v5.4s, v1.s[2]\n"
       "fmla v25.4s, v5.4s, v2.s[2]\n"
       "fmla v31.4s, v5.4s, v3.s[2]\n"
       "ldr d5, [x15, #0x150]\n"
       "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "mov v5.d[1], x9\n"
+      "mov v5.d[1], x22\n"
       "fmla v14.4s, v6.4s, v1.s[3]\n"
       "fmla v20.4s, v6.4s, v2.s[3]\n"
       "fmla v26.4s, v6.4s, v3.s[3]\n"
       "ldr d6, [x15, #0x160]\n"
       "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "mov v6.d[1], x28\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.4s, v7.4s, v1.s[3]\n"
       "fmla v21.4s, v7.4s, v2.s[3]\n"
       "fmla v27.4s, v7.4s, v3.s[3]\n"
       "ldr d7, [x15, #0x170]\n"
-      "mov v7.d[1], x27\n"
+      "mov v7.d[1], x20\n"
       "add x15, x15, #0x180\n"
       "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "ldr x10, [x15, #0x8]\n"
+      "ldr x23, [x15, #0x8]\n"
       "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "ldr x9, [x15, #0x18]\n"
+      "ldr x22, [x15, #0x18]\n"
       "fmla v22.4s, v4.4s, v2.s[3]\n"
-      "ldr x28, [x15, #0x28]\n"
+      "ldr x21, [x15, #0x28]\n"
       "fmla v28.4s, v4.4s, v3.s[3]\n"
       "ldr d4, [x15, #0x0]\n"
       "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "ldr x27, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "mov v4.d[1], x10\n"
+      "mov v4.d[1], x23\n"
       "fmla v23.4s, v5.4s, v2.s[3]\n"
       "fmla v29.4s, v5.4s, v3.s[3]\n"
       "ldr d5, [x15, #0x10]\n"
       "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "mov v5.d[1], x9\n"
+      "mov v5.d[1], x22\n"
       "fmla v18.4s, v6.4s, v1.s[3]\n"
       "fmla v24.4s, v6.4s, v2.s[3]\n"
       "fmla v30.4s, v6.4s, v3.s[3]\n"
@@ -2352,30 +2351,30 @@
       "fmla v13.4s, v7.4s, v0.s[3]\n"
       "ldr d0, [x11, #0x0]\n"
       "fmla v19.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x25, #0x0]\n"
+      "ldr d1, [x10, #0x0]\n"
       "fmla v25.4s, v7.4s, v2.s[3]\n"
-      "ldr d2, [x23, #0x0]\n"
+      "ldr d2, [x9, #0x0]\n"
       "fmla v31.4s, v7.4s, v3.s[3]\n"
-      "ldr d3, [x21, #0x0]\n"
+      "ldr d3, [x28, #0x0]\n"
       "ldr d7, [x15, #0x30]\n"
-      "mov v6.d[1], x28\n"
-      "mov v0.d[1], x26\n"
-      "mov v1.d[1], x24\n"
-      "mov v2.d[1], x22\n"
-      "mov v3.d[1], x20\n"
-      "mov v7.d[1], x27\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x27\n"
+      "mov v1.d[1], x26\n"
+      "mov v2.d[1], x25\n"
+      "mov v3.d[1], x24\n"
+      "mov v7.d[1], x20\n"
       "bge 145b\n"
       "146:"  // Height 4: Multiply loop: Single iteration only
       "fmla v8.4s, v4.4s, v0.s[0]\n"
       "add x11, x11, #0x10\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v26.4s, v4.4s, v3.s[0]\n"
       "ldr q4, [x15, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
       "sub x12, x12, #0x4\n"
       "fmla v21.4s, v5.4s, v2.s[0]\n"
@@ -2383,11 +2382,11 @@
       "fmla v27.4s, v5.4s, v3.s[0]\n"
       "ldr q5, [x15, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "fmla v28.4s, v6.4s, v3.s[0]\n"
       "ldr q6, [x15, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
@@ -2495,42 +2494,42 @@
       "147:"  // Height 4: Multiply loop: Main loop skip
       "cbz x12, 149f\n"
       "148:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
+      "ldr s7, [x11], #0x4\n"
       "sub x12, x12, #0x1\n"
-      "ldr s1, [x25], #0x4\n"
-      "ldr s2, [x23], #0x4\n"
-      "ldr s3, [x21], #0x4\n"
-      "ldr q4, [x15, #0x0]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q5, [x15, #0x10]\n"
-      "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      "fmla v26.4s, v4.4s, v3.s[0]\n"
-      "ldr q4, [x15, #0x40]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "fmla v27.4s, v5.4s, v3.s[0]\n"
-      "ldr q5, [x15, #0x50]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s6, [x10], #0x4\n"
+      "ldr s5, [x9], #0x4\n"
+      "ldr s4, [x28], #0x4\n"
+      "ldr q1, [x15, #0x0]\n"
+      "fmla v8.4s, v1.4s, v7.s[0]\n"
+      "ldr q0, [x15, #0x10]\n"
+      "fmla v14.4s, v1.4s, v6.s[0]\n"
+      "ldr q3, [x15, #0x20]\n"
+      "fmla v20.4s, v1.4s, v5.s[0]\n"
+      "ldr q2, [x15, #0x30]\n"
+      "fmla v26.4s, v1.4s, v4.s[0]\n"
+      "ldr q1, [x15, #0x40]\n"
+      "fmla v9.4s, v0.4s, v7.s[0]\n"
+      "fmla v15.4s, v0.4s, v6.s[0]\n"
+      "fmla v21.4s, v0.4s, v5.s[0]\n"
+      "fmla v27.4s, v0.4s, v4.s[0]\n"
+      "ldr q0, [x15, #0x50]\n"
+      "fmla v10.4s, v3.4s, v7.s[0]\n"
       "add x15, x15, #0x60\n"
-      "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "fmla v28.4s, v6.4s, v3.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "fmla v29.4s, v7.4s, v3.s[0]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "fmla v30.4s, v4.4s, v3.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "fmla v31.4s, v5.4s, v3.s[0]\n"
+      "fmla v16.4s, v3.4s, v6.s[0]\n"
+      "fmla v22.4s, v3.4s, v5.s[0]\n"
+      "fmla v28.4s, v3.4s, v4.s[0]\n"
+      "fmla v11.4s, v2.4s, v7.s[0]\n"
+      "fmla v17.4s, v2.4s, v6.s[0]\n"
+      "fmla v23.4s, v2.4s, v5.s[0]\n"
+      "fmla v29.4s, v2.4s, v4.s[0]\n"
+      "fmla v12.4s, v1.4s, v7.s[0]\n"
+      "fmla v18.4s, v1.4s, v6.s[0]\n"
+      "fmla v24.4s, v1.4s, v5.s[0]\n"
+      "fmla v30.4s, v1.4s, v4.s[0]\n"
+      "fmla v13.4s, v0.4s, v7.s[0]\n"
+      "fmla v19.4s, v0.4s, v6.s[0]\n"
+      "fmla v25.4s, v0.4s, v5.s[0]\n"
+      "fmla v31.4s, v0.4s, v4.s[0]\n"
       "cbnz x12, 148b\n"
       "149:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2796,7 +2795,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "166:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
index 5fb71c9..dbd4546 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
@@ -92,7 +92,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 124f\n"
@@ -223,11 +222,11 @@
       "19:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -246,126 +245,126 @@
       "blt 23f\n"
       "22:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q19, [x28, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q18, [x28, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q17, [x28, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x28, #0x70]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "ldr q4, [x28, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "ldr q5, [x28, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "ldr q4, [x28, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "ldr q5, [x28, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x28, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x28, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "ldr q4, [x28, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "ldr q5, [x28, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x28, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x28, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "ldr q4, [x28, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "ldr q5, [x28, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x28, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x28, #0x170]\n"
+      "ldr q16, [x28, #0x70]\n"
+      "fmla v12.4s, v19.4s, v0.s[0]\n"
+      "ldr q19, [x28, #0x80]\n"
+      "fmla v13.4s, v18.4s, v0.s[0]\n"
+      "ldr q18, [x28, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x28, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x28, #0xb0]\n"
+      "fmla v10.4s, v19.4s, v0.s[1]\n"
+      "ldr q19, [x28, #0xc0]\n"
+      "fmla v11.4s, v18.4s, v0.s[1]\n"
+      "ldr q18, [x28, #0xd0]\n"
+      "fmla v12.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x28, #0xe0]\n"
+      "fmla v13.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x28, #0xf0]\n"
+      "fmla v8.4s, v19.4s, v0.s[2]\n"
+      "ldr q19, [x28, #0x100]\n"
+      "fmla v9.4s, v18.4s, v0.s[2]\n"
+      "ldr q18, [x28, #0x110]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x28, #0x120]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x28, #0x130]\n"
+      "fmla v12.4s, v19.4s, v0.s[2]\n"
+      "ldr q19, [x28, #0x140]\n"
+      "fmla v13.4s, v18.4s, v0.s[2]\n"
+      "ldr q18, [x28, #0x150]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x28, #0x160]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x28, #0x170]\n"
       "sub x25, x25, #0x4\n"
       "add x24, x24, #0x10\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
+      "fmla v10.4s, v19.4s, v0.s[3]\n"
+      "fmla v11.4s, v18.4s, v0.s[3]\n"
       "cmp x25, #0x8\n"
       "add x28, x28, #0x180\n"
       "ldr q4, [x28, #0x0]\n"
       "ldr q5, [x28, #0x10]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v0.s[3]\n"
       "ldr q6, [x28, #0x20]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v0.s[3]\n"
       "ldr q0, [x24, #0x0]\n"
       "ldr q7, [x28, #0x30]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "bge 22b\n"
       "23:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q19, [x28, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q18, [x28, #0x50]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q17, [x28, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x28, #0x70]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "ldr q4, [x28, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "ldr q5, [x28, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "ldr q4, [x28, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "ldr q5, [x28, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x28, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x28, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "ldr q4, [x28, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "ldr q5, [x28, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x28, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x28, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "ldr q4, [x28, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "ldr q5, [x28, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x28, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x28, #0x170]\n"
+      "ldr q16, [x28, #0x70]\n"
+      "fmla v12.4s, v19.4s, v0.s[0]\n"
+      "ldr q19, [x28, #0x80]\n"
+      "fmla v13.4s, v18.4s, v0.s[0]\n"
+      "ldr q18, [x28, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x28, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x28, #0xb0]\n"
+      "fmla v10.4s, v19.4s, v0.s[1]\n"
+      "ldr q19, [x28, #0xc0]\n"
+      "fmla v11.4s, v18.4s, v0.s[1]\n"
+      "ldr q18, [x28, #0xd0]\n"
+      "fmla v12.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x28, #0xe0]\n"
+      "fmla v13.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x28, #0xf0]\n"
+      "fmla v8.4s, v19.4s, v0.s[2]\n"
+      "ldr q19, [x28, #0x100]\n"
+      "fmla v9.4s, v18.4s, v0.s[2]\n"
+      "ldr q18, [x28, #0x110]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x28, #0x120]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x28, #0x130]\n"
+      "fmla v12.4s, v19.4s, v0.s[2]\n"
+      "ldr q19, [x28, #0x140]\n"
+      "fmla v13.4s, v18.4s, v0.s[2]\n"
+      "ldr q18, [x28, #0x150]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x28, #0x160]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x28, #0x170]\n"
       "add x24, x24, #0x10\n"
       "sub x25, x25, #0x4\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
+      "fmla v10.4s, v19.4s, v0.s[3]\n"
+      "fmla v11.4s, v18.4s, v0.s[3]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "add x28, x28, #0x180\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v0.s[3]\n"
       "24:"  // Height 1: Multiply loop: Main loop skip
       "cbz x25, 26f\n"
       "25:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x24], #0x4\n"
-      "ldr q4, [x28, #0x0]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
+      "ldr s18, [x24], #0x4\n"
+      "ldr q16, [x28, #0x0]\n"
+      "fmla v8.4s, v16.4s, v18.s[0]\n"
       "sub x25, x25, #0x1\n"
-      "ldr q5, [x28, #0x10]\n"
-      "ldr q6, [x28, #0x20]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x28, #0x30]\n"
-      "ldr q4, [x28, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
+      "ldr q17, [x28, #0x10]\n"
+      "ldr q16, [x28, #0x20]\n"
+      "fmla v9.4s, v17.4s, v18.s[0]\n"
+      "fmla v10.4s, v16.4s, v18.s[0]\n"
+      "ldr q17, [x28, #0x30]\n"
+      "ldr q16, [x28, #0x40]\n"
+      "fmla v11.4s, v17.4s, v18.s[0]\n"
+      "fmla v12.4s, v16.4s, v18.s[0]\n"
+      "ldr q16, [x28, #0x50]\n"
+      "fmla v13.4s, v16.4s, v18.s[0]\n"
       "add x28, x28, #0x60\n"
       "cbnz x25, 25b\n"
       "26:"  // Height 1: Multiply loop: No odd multiplies
@@ -376,21 +375,21 @@
       "prfm pstl1keep, [x27, #0x0]\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmin v12.4s, v12.4s, v17.4s\n"
+      "fmin v13.4s, v13.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
+      "fmax v12.4s, v12.4s, v16.4s\n"
+      "fmax v13.4s, v13.4s, v16.4s\n"
       "27:"  // Height 1: No activation
       "cmp x9, #0x18\n"
       "bge 40f\n"
@@ -651,12 +650,12 @@
       "60:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 61f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 62f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -664,7 +663,7 @@
       "b 62f\n"
       "61:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "62:"  // Height 2: input setup done
       "cmp x25, #0x4\n"
       "blt 65f\n"
@@ -679,186 +678,186 @@
       "63:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.4s, v4.4s, v0.s[0]\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q23, [x28, #0x40]\n"
       "sub x25, x25, #0x4\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q22, [x28, #0x50]\n"
       "add x24, x24, #0x10\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q21, [x28, #0x60]\n"
       "add x23, x23, #0x10\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x28, #0x70]\n"
+      "ldr q20, [x28, #0x70]\n"
       "cmp x25, #0x8\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "ldr q4, [x28, #0x80]\n"
+      "fmla v12.4s, v23.4s, v0.s[0]\n"
+      "fmla v18.4s, v23.4s, v1.s[0]\n"
+      "ldr q23, [x28, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "ldr q5, [x28, #0x90]\n"
+      "fmla v13.4s, v22.4s, v0.s[0]\n"
+      "fmla v19.4s, v22.4s, v1.s[0]\n"
+      "ldr q22, [x28, #0x90]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "ldr q4, [x28, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "ldr q5, [x28, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x28, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x28, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "ldr q4, [x28, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "ldr q5, [x28, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x28, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x28, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "ldr q4, [x28, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "ldr q5, [x28, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x28, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x28, #0x170]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "ldr q21, [x28, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "ldr q20, [x28, #0xb0]\n"
+      "fmla v10.4s, v23.4s, v0.s[1]\n"
+      "fmla v16.4s, v23.4s, v1.s[1]\n"
+      "ldr q23, [x28, #0xc0]\n"
+      "fmla v11.4s, v22.4s, v0.s[1]\n"
+      "fmla v17.4s, v22.4s, v1.s[1]\n"
+      "ldr q22, [x28, #0xd0]\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "fmla v18.4s, v21.4s, v1.s[1]\n"
+      "ldr q21, [x28, #0xe0]\n"
+      "fmla v13.4s, v20.4s, v0.s[1]\n"
+      "fmla v19.4s, v20.4s, v1.s[1]\n"
+      "ldr q20, [x28, #0xf0]\n"
+      "fmla v8.4s, v23.4s, v0.s[2]\n"
+      "fmla v14.4s, v23.4s, v1.s[2]\n"
+      "ldr q23, [x28, #0x100]\n"
+      "fmla v9.4s, v22.4s, v0.s[2]\n"
+      "fmla v15.4s, v22.4s, v1.s[2]\n"
+      "ldr q22, [x28, #0x110]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v16.4s, v21.4s, v1.s[2]\n"
+      "ldr q21, [x28, #0x120]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "fmla v17.4s, v20.4s, v1.s[2]\n"
+      "ldr q20, [x28, #0x130]\n"
+      "fmla v12.4s, v23.4s, v0.s[2]\n"
+      "fmla v18.4s, v23.4s, v1.s[2]\n"
+      "ldr q23, [x28, #0x140]\n"
+      "fmla v13.4s, v22.4s, v0.s[2]\n"
+      "fmla v19.4s, v22.4s, v1.s[2]\n"
+      "ldr q22, [x28, #0x150]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "ldr q21, [x28, #0x160]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "ldr q20, [x28, #0x170]\n"
       "add x28, x28, #0x180\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
+      "fmla v10.4s, v23.4s, v0.s[3]\n"
+      "fmla v16.4s, v23.4s, v1.s[3]\n"
       "ldr q4, [x28, #0x0]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
+      "fmla v11.4s, v22.4s, v0.s[3]\n"
+      "fmla v17.4s, v22.4s, v1.s[3]\n"
       "ldr q5, [x28, #0x10]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
+      "fmla v12.4s, v21.4s, v0.s[3]\n"
+      "fmla v18.4s, v21.4s, v1.s[3]\n"
       "ldr q6, [x28, #0x20]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v20.4s, v0.s[3]\n"
       "ldr q0, [x24, #0x0]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v20.4s, v1.s[3]\n"
       "ldr q1, [x23, #0x0]\n"
       "ldr q7, [x28, #0x30]\n"
       "bge 63b\n"
       "64:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.4s, v4.4s, v0.s[0]\n"
       "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q23, [x28, #0x40]\n"
       "add x24, x24, #0x10\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q22, [x28, #0x50]\n"
       "add x23, x23, #0x10\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q21, [x28, #0x60]\n"
       "sub x25, x25, #0x4\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x28, #0x70]\n"
+      "ldr q20, [x28, #0x70]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "ldr q4, [x28, #0x80]\n"
+      "fmla v12.4s, v23.4s, v0.s[0]\n"
+      "fmla v18.4s, v23.4s, v1.s[0]\n"
+      "ldr q23, [x28, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "ldr q5, [x28, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "ldr q4, [x28, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "ldr q5, [x28, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x28, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x28, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "ldr q4, [x28, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "ldr q5, [x28, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x28, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x28, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "ldr q4, [x28, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "ldr q5, [x28, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x28, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x28, #0x170]\n"
+      "fmla v13.4s, v22.4s, v0.s[0]\n"
+      "fmla v19.4s, v22.4s, v1.s[0]\n"
+      "ldr q22, [x28, #0x90]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "ldr q21, [x28, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "ldr q20, [x28, #0xb0]\n"
+      "fmla v10.4s, v23.4s, v0.s[1]\n"
+      "fmla v16.4s, v23.4s, v1.s[1]\n"
+      "ldr q23, [x28, #0xc0]\n"
+      "fmla v11.4s, v22.4s, v0.s[1]\n"
+      "fmla v17.4s, v22.4s, v1.s[1]\n"
+      "ldr q22, [x28, #0xd0]\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "fmla v18.4s, v21.4s, v1.s[1]\n"
+      "ldr q21, [x28, #0xe0]\n"
+      "fmla v13.4s, v20.4s, v0.s[1]\n"
+      "fmla v19.4s, v20.4s, v1.s[1]\n"
+      "ldr q20, [x28, #0xf0]\n"
+      "fmla v8.4s, v23.4s, v0.s[2]\n"
+      "fmla v14.4s, v23.4s, v1.s[2]\n"
+      "ldr q23, [x28, #0x100]\n"
+      "fmla v9.4s, v22.4s, v0.s[2]\n"
+      "fmla v15.4s, v22.4s, v1.s[2]\n"
+      "ldr q22, [x28, #0x110]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v16.4s, v21.4s, v1.s[2]\n"
+      "ldr q21, [x28, #0x120]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "fmla v17.4s, v20.4s, v1.s[2]\n"
+      "ldr q20, [x28, #0x130]\n"
+      "fmla v12.4s, v23.4s, v0.s[2]\n"
+      "fmla v18.4s, v23.4s, v1.s[2]\n"
+      "ldr q23, [x28, #0x140]\n"
+      "fmla v13.4s, v22.4s, v0.s[2]\n"
+      "fmla v19.4s, v22.4s, v1.s[2]\n"
+      "ldr q22, [x28, #0x150]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "ldr q21, [x28, #0x160]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "ldr q20, [x28, #0x170]\n"
       "add x28, x28, #0x180\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
+      "fmla v10.4s, v23.4s, v0.s[3]\n"
+      "fmla v16.4s, v23.4s, v1.s[3]\n"
+      "fmla v11.4s, v22.4s, v0.s[3]\n"
+      "fmla v17.4s, v22.4s, v1.s[3]\n"
+      "fmla v12.4s, v21.4s, v0.s[3]\n"
+      "fmla v18.4s, v21.4s, v1.s[3]\n"
+      "fmla v13.4s, v20.4s, v0.s[3]\n"
+      "fmla v19.4s, v20.4s, v1.s[3]\n"
       "65:"  // Height 2: Multiply loop: Main loop skip
       "cbz x25, 67f\n"
       "66:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x24], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
       "sub x25, x25, #0x1\n"
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q6, [x28, #0x20]\n"
-      "ldr q7, [x28, #0x30]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
-      "ldr q5, [x28, #0x50]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v17.4s, v7.4s, v1.s[0]\n"
+      "ldr q21, [x28, #0x0]\n"
+      "ldr q20, [x28, #0x10]\n"
+      "fmla v8.4s, v21.4s, v25.s[0]\n"
+      "fmla v14.4s, v21.4s, v24.s[0]\n"
+      "ldr q23, [x28, #0x20]\n"
+      "ldr q22, [x28, #0x30]\n"
+      "fmla v9.4s, v20.4s, v25.s[0]\n"
+      "fmla v15.4s, v20.4s, v24.s[0]\n"
+      "ldr q21, [x28, #0x40]\n"
+      "ldr q20, [x28, #0x50]\n"
+      "fmla v10.4s, v23.4s, v25.s[0]\n"
+      "fmla v16.4s, v23.4s, v24.s[0]\n"
+      "fmla v11.4s, v22.4s, v25.s[0]\n"
+      "fmla v17.4s, v22.4s, v24.s[0]\n"
       "add x28, x28, #0x60\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
+      "fmla v12.4s, v21.4s, v25.s[0]\n"
+      "fmla v18.4s, v21.4s, v24.s[0]\n"
+      "fmla v13.4s, v20.4s, v25.s[0]\n"
+      "fmla v19.4s, v20.4s, v24.s[0]\n"
       "cbnz x25, 66b\n"
       "67:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -871,33 +870,33 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 68f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v21.4s\n"
+      "fmin v9.4s, v9.4s, v21.4s\n"
+      "fmin v10.4s, v10.4s, v21.4s\n"
+      "fmin v11.4s, v11.4s, v21.4s\n"
+      "fmin v12.4s, v12.4s, v21.4s\n"
+      "fmin v13.4s, v13.4s, v21.4s\n"
+      "fmin v14.4s, v14.4s, v21.4s\n"
+      "fmin v15.4s, v15.4s, v21.4s\n"
+      "fmin v16.4s, v16.4s, v21.4s\n"
+      "fmin v17.4s, v17.4s, v21.4s\n"
+      "fmin v18.4s, v18.4s, v21.4s\n"
+      "fmin v19.4s, v19.4s, v21.4s\n"
+      "fmax v8.4s, v8.4s, v20.4s\n"
+      "fmax v9.4s, v9.4s, v20.4s\n"
+      "fmax v10.4s, v10.4s, v20.4s\n"
+      "fmax v11.4s, v11.4s, v20.4s\n"
+      "fmax v12.4s, v12.4s, v20.4s\n"
+      "fmax v13.4s, v13.4s, v20.4s\n"
+      "fmax v14.4s, v14.4s, v20.4s\n"
+      "fmax v15.4s, v15.4s, v20.4s\n"
+      "fmax v16.4s, v16.4s, v20.4s\n"
+      "fmax v17.4s, v17.4s, v20.4s\n"
+      "fmax v18.4s, v18.4s, v20.4s\n"
+      "fmax v19.4s, v19.4s, v20.4s\n"
       "68:"  // Height 2: No activation
       "cmp x9, #0x18\n"
       "bge 81f\n"
@@ -1237,13 +1236,13 @@
       "101:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 102f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 103f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -1252,8 +1251,8 @@
       "b 103f\n"
       "102:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "103:"  // Height 3: input setup done
       "cmp x25, #0x4\n"
       "blt 106f\n"
@@ -1272,107 +1271,107 @@
       "sub x25, x25, #0x4\n"
       "add x24, x24, #0x10\n"
       "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q29, [x28, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
       "add x23, x23, #0x10\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
       "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q28, [x28, #0x50]\n"
       "add x22, x22, #0x10\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
       "cmp x25, #0x8\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q27, [x28, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
       "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x28, #0x70]\n"
+      "ldr q26, [x28, #0x70]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "ldr q4, [x28, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x28, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v20.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v21.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "fmla v22.4s, v4.4s, v2.s[1]\n"
-      "ldr q4, [x28, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "fmla v23.4s, v5.4s, v2.s[1]\n"
-      "ldr q5, [x28, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "fmla v24.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x28, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "fmla v25.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x28, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "fmla v20.4s, v4.4s, v2.s[2]\n"
-      "ldr q4, [x28, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "fmla v21.4s, v5.4s, v2.s[2]\n"
-      "ldr q5, [x28, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "fmla v22.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x28, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "fmla v23.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x28, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "fmla v24.4s, v4.4s, v2.s[2]\n"
-      "ldr q4, [x28, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "fmla v25.4s, v5.4s, v2.s[2]\n"
-      "ldr q5, [x28, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v20.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x28, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v21.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x28, #0x170]\n"
+      "fmla v12.4s, v29.4s, v0.s[0]\n"
+      "fmla v18.4s, v29.4s, v1.s[0]\n"
+      "fmla v24.4s, v29.4s, v2.s[0]\n"
+      "ldr q29, [x28, #0x80]\n"
+      "fmla v13.4s, v28.4s, v0.s[0]\n"
+      "fmla v19.4s, v28.4s, v1.s[0]\n"
+      "fmla v25.4s, v28.4s, v2.s[0]\n"
+      "ldr q28, [x28, #0x90]\n"
+      "fmla v8.4s, v27.4s, v0.s[1]\n"
+      "fmla v14.4s, v27.4s, v1.s[1]\n"
+      "fmla v20.4s, v27.4s, v2.s[1]\n"
+      "ldr q27, [x28, #0xa0]\n"
+      "fmla v9.4s, v26.4s, v0.s[1]\n"
+      "fmla v15.4s, v26.4s, v1.s[1]\n"
+      "fmla v21.4s, v26.4s, v2.s[1]\n"
+      "ldr q26, [x28, #0xb0]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v16.4s, v29.4s, v1.s[1]\n"
+      "fmla v22.4s, v29.4s, v2.s[1]\n"
+      "ldr q29, [x28, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v17.4s, v28.4s, v1.s[1]\n"
+      "fmla v23.4s, v28.4s, v2.s[1]\n"
+      "ldr q28, [x28, #0xd0]\n"
+      "fmla v12.4s, v27.4s, v0.s[1]\n"
+      "fmla v18.4s, v27.4s, v1.s[1]\n"
+      "fmla v24.4s, v27.4s, v2.s[1]\n"
+      "ldr q27, [x28, #0xe0]\n"
+      "fmla v13.4s, v26.4s, v0.s[1]\n"
+      "fmla v19.4s, v26.4s, v1.s[1]\n"
+      "fmla v25.4s, v26.4s, v2.s[1]\n"
+      "ldr q26, [x28, #0xf0]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v20.4s, v29.4s, v2.s[2]\n"
+      "ldr q29, [x28, #0x100]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v21.4s, v28.4s, v2.s[2]\n"
+      "ldr q28, [x28, #0x110]\n"
+      "fmla v10.4s, v27.4s, v0.s[2]\n"
+      "fmla v16.4s, v27.4s, v1.s[2]\n"
+      "fmla v22.4s, v27.4s, v2.s[2]\n"
+      "ldr q27, [x28, #0x120]\n"
+      "fmla v11.4s, v26.4s, v0.s[2]\n"
+      "fmla v17.4s, v26.4s, v1.s[2]\n"
+      "fmla v23.4s, v26.4s, v2.s[2]\n"
+      "ldr q26, [x28, #0x130]\n"
+      "fmla v12.4s, v29.4s, v0.s[2]\n"
+      "fmla v18.4s, v29.4s, v1.s[2]\n"
+      "fmla v24.4s, v29.4s, v2.s[2]\n"
+      "ldr q29, [x28, #0x140]\n"
+      "fmla v13.4s, v28.4s, v0.s[2]\n"
+      "fmla v19.4s, v28.4s, v1.s[2]\n"
+      "fmla v25.4s, v28.4s, v2.s[2]\n"
+      "ldr q28, [x28, #0x150]\n"
+      "fmla v8.4s, v27.4s, v0.s[3]\n"
+      "fmla v14.4s, v27.4s, v1.s[3]\n"
+      "fmla v20.4s, v27.4s, v2.s[3]\n"
+      "ldr q27, [x28, #0x160]\n"
+      "fmla v9.4s, v26.4s, v0.s[3]\n"
+      "fmla v15.4s, v26.4s, v1.s[3]\n"
+      "fmla v21.4s, v26.4s, v2.s[3]\n"
+      "ldr q26, [x28, #0x170]\n"
       "add x28, x28, #0x180\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "fmla v22.4s, v4.4s, v2.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "fmla v16.4s, v29.4s, v1.s[3]\n"
+      "fmla v22.4s, v29.4s, v2.s[3]\n"
       "ldr q4, [x28, #0x0]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "fmla v23.4s, v5.4s, v2.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "fmla v17.4s, v28.4s, v1.s[3]\n"
+      "fmla v23.4s, v28.4s, v2.s[3]\n"
       "ldr q5, [x28, #0x10]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
-      "fmla v24.4s, v6.4s, v2.s[3]\n"
+      "fmla v12.4s, v27.4s, v0.s[3]\n"
+      "fmla v18.4s, v27.4s, v1.s[3]\n"
+      "fmla v24.4s, v27.4s, v2.s[3]\n"
       "ldr q6, [x28, #0x20]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v26.4s, v0.s[3]\n"
       "ldr q0, [x24, #0x0]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v26.4s, v1.s[3]\n"
       "ldr q1, [x23, #0x0]\n"
-      "fmla v25.4s, v7.4s, v2.s[3]\n"
+      "fmla v25.4s, v26.4s, v2.s[3]\n"
       "ldr q2, [x22, #0x0]\n"
       "ldr q7, [x28, #0x30]\n"
       "bge 104b\n"
@@ -1382,133 +1381,133 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q29, [x28, #0x40]\n"
       "fmla v9.4s, v5.4s, v0.s[0]\n"
       "add x22, x22, #0x10\n"
       "fmla v15.4s, v5.4s, v1.s[0]\n"
       "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q28, [x28, #0x50]\n"
       "sub x25, x25, #0x4\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "fmla v16.4s, v6.4s, v1.s[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q27, [x28, #0x60]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
       "fmla v17.4s, v7.4s, v1.s[0]\n"
       "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x28, #0x70]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "ldr q4, [x28, #0x80]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x28, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v20.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v21.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      "fmla v10.4s, v4.4s, v0.s[1]\n"
-      "fmla v16.4s, v4.4s, v1.s[1]\n"
-      "fmla v22.4s, v4.4s, v2.s[1]\n"
-      "ldr q4, [x28, #0xc0]\n"
-      "fmla v11.4s, v5.4s, v0.s[1]\n"
-      "fmla v17.4s, v5.4s, v1.s[1]\n"
-      "fmla v23.4s, v5.4s, v2.s[1]\n"
-      "ldr q5, [x28, #0xd0]\n"
-      "fmla v12.4s, v6.4s, v0.s[1]\n"
-      "fmla v18.4s, v6.4s, v1.s[1]\n"
-      "fmla v24.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x28, #0xe0]\n"
-      "fmla v13.4s, v7.4s, v0.s[1]\n"
-      "fmla v19.4s, v7.4s, v1.s[1]\n"
-      "fmla v25.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x28, #0xf0]\n"
-      "fmla v8.4s, v4.4s, v0.s[2]\n"
-      "fmla v14.4s, v4.4s, v1.s[2]\n"
-      "fmla v20.4s, v4.4s, v2.s[2]\n"
-      "ldr q4, [x28, #0x100]\n"
-      "fmla v9.4s, v5.4s, v0.s[2]\n"
-      "fmla v15.4s, v5.4s, v1.s[2]\n"
-      "fmla v21.4s, v5.4s, v2.s[2]\n"
-      "ldr q5, [x28, #0x110]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v16.4s, v6.4s, v1.s[2]\n"
-      "fmla v22.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x28, #0x120]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v17.4s, v7.4s, v1.s[2]\n"
-      "fmla v23.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x28, #0x130]\n"
-      "fmla v12.4s, v4.4s, v0.s[2]\n"
-      "fmla v18.4s, v4.4s, v1.s[2]\n"
-      "fmla v24.4s, v4.4s, v2.s[2]\n"
-      "ldr q4, [x28, #0x140]\n"
-      "fmla v13.4s, v5.4s, v0.s[2]\n"
-      "fmla v19.4s, v5.4s, v1.s[2]\n"
-      "fmla v25.4s, v5.4s, v2.s[2]\n"
-      "ldr q5, [x28, #0x150]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v20.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x28, #0x160]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v21.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x28, #0x170]\n"
+      "ldr q26, [x28, #0x70]\n"
+      "fmla v12.4s, v29.4s, v0.s[0]\n"
+      "fmla v18.4s, v29.4s, v1.s[0]\n"
+      "fmla v24.4s, v29.4s, v2.s[0]\n"
+      "ldr q29, [x28, #0x80]\n"
+      "fmla v13.4s, v28.4s, v0.s[0]\n"
+      "fmla v19.4s, v28.4s, v1.s[0]\n"
+      "fmla v25.4s, v28.4s, v2.s[0]\n"
+      "ldr q28, [x28, #0x90]\n"
+      "fmla v8.4s, v27.4s, v0.s[1]\n"
+      "fmla v14.4s, v27.4s, v1.s[1]\n"
+      "fmla v20.4s, v27.4s, v2.s[1]\n"
+      "ldr q27, [x28, #0xa0]\n"
+      "fmla v9.4s, v26.4s, v0.s[1]\n"
+      "fmla v15.4s, v26.4s, v1.s[1]\n"
+      "fmla v21.4s, v26.4s, v2.s[1]\n"
+      "ldr q26, [x28, #0xb0]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v16.4s, v29.4s, v1.s[1]\n"
+      "fmla v22.4s, v29.4s, v2.s[1]\n"
+      "ldr q29, [x28, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v17.4s, v28.4s, v1.s[1]\n"
+      "fmla v23.4s, v28.4s, v2.s[1]\n"
+      "ldr q28, [x28, #0xd0]\n"
+      "fmla v12.4s, v27.4s, v0.s[1]\n"
+      "fmla v18.4s, v27.4s, v1.s[1]\n"
+      "fmla v24.4s, v27.4s, v2.s[1]\n"
+      "ldr q27, [x28, #0xe0]\n"
+      "fmla v13.4s, v26.4s, v0.s[1]\n"
+      "fmla v19.4s, v26.4s, v1.s[1]\n"
+      "fmla v25.4s, v26.4s, v2.s[1]\n"
+      "ldr q26, [x28, #0xf0]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v20.4s, v29.4s, v2.s[2]\n"
+      "ldr q29, [x28, #0x100]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v21.4s, v28.4s, v2.s[2]\n"
+      "ldr q28, [x28, #0x110]\n"
+      "fmla v10.4s, v27.4s, v0.s[2]\n"
+      "fmla v16.4s, v27.4s, v1.s[2]\n"
+      "fmla v22.4s, v27.4s, v2.s[2]\n"
+      "ldr q27, [x28, #0x120]\n"
+      "fmla v11.4s, v26.4s, v0.s[2]\n"
+      "fmla v17.4s, v26.4s, v1.s[2]\n"
+      "fmla v23.4s, v26.4s, v2.s[2]\n"
+      "ldr q26, [x28, #0x130]\n"
+      "fmla v12.4s, v29.4s, v0.s[2]\n"
+      "fmla v18.4s, v29.4s, v1.s[2]\n"
+      "fmla v24.4s, v29.4s, v2.s[2]\n"
+      "ldr q29, [x28, #0x140]\n"
+      "fmla v13.4s, v28.4s, v0.s[2]\n"
+      "fmla v19.4s, v28.4s, v1.s[2]\n"
+      "fmla v25.4s, v28.4s, v2.s[2]\n"
+      "ldr q28, [x28, #0x150]\n"
+      "fmla v8.4s, v27.4s, v0.s[3]\n"
+      "fmla v14.4s, v27.4s, v1.s[3]\n"
+      "fmla v20.4s, v27.4s, v2.s[3]\n"
+      "ldr q27, [x28, #0x160]\n"
+      "fmla v9.4s, v26.4s, v0.s[3]\n"
+      "fmla v15.4s, v26.4s, v1.s[3]\n"
+      "fmla v21.4s, v26.4s, v2.s[3]\n"
+      "ldr q26, [x28, #0x170]\n"
       "add x28, x28, #0x180\n"
-      "fmla v10.4s, v4.4s, v0.s[3]\n"
-      "fmla v16.4s, v4.4s, v1.s[3]\n"
-      "fmla v22.4s, v4.4s, v2.s[3]\n"
-      "fmla v11.4s, v5.4s, v0.s[3]\n"
-      "fmla v17.4s, v5.4s, v1.s[3]\n"
-      "fmla v23.4s, v5.4s, v2.s[3]\n"
-      "fmla v12.4s, v6.4s, v0.s[3]\n"
-      "fmla v18.4s, v6.4s, v1.s[3]\n"
-      "fmla v24.4s, v6.4s, v2.s[3]\n"
-      "fmla v13.4s, v7.4s, v0.s[3]\n"
-      "fmla v19.4s, v7.4s, v1.s[3]\n"
-      "fmla v25.4s, v7.4s, v2.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "fmla v16.4s, v29.4s, v1.s[3]\n"
+      "fmla v22.4s, v29.4s, v2.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "fmla v17.4s, v28.4s, v1.s[3]\n"
+      "fmla v23.4s, v28.4s, v2.s[3]\n"
+      "fmla v12.4s, v27.4s, v0.s[3]\n"
+      "fmla v18.4s, v27.4s, v1.s[3]\n"
+      "fmla v24.4s, v27.4s, v2.s[3]\n"
+      "fmla v13.4s, v26.4s, v0.s[3]\n"
+      "fmla v19.4s, v26.4s, v1.s[3]\n"
+      "fmla v25.4s, v26.4s, v2.s[3]\n"
       "106:"  // Height 3: Multiply loop: Main loop skip
       "cbz x25, 108f\n"
       "107:"  // Height 3: Multiply loop: Odd block loop
       "ldr s0, [x24], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
       "sub x25, x25, #0x1\n"
-      "ldr s2, [x22], #0x4\n"
-      "ldr q4, [x28, #0x0]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q5, [x28, #0x10]\n"
-      "ldr q6, [x28, #0x20]\n"
-      "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "ldr q7, [x28, #0x30]\n"
-      "ldr q4, [x28, #0x40]\n"
-      "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v16.4s, v6.4s, v1.s[0]\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q27, [x28, #0x0]\n"
+      "fmla v8.4s, v27.4s, v0.s[0]\n"
+      "fmla v14.4s, v27.4s, v31.s[0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      "ldr q29, [x28, #0x20]\n"
+      "fmla v20.4s, v27.4s, v30.s[0]\n"
+      "fmla v9.4s, v26.4s, v0.s[0]\n"
+      "ldr q28, [x28, #0x30]\n"
+      "ldr q27, [x28, #0x40]\n"
+      "fmla v15.4s, v26.4s, v31.s[0]\n"
+      "fmla v21.4s, v26.4s, v30.s[0]\n"
+      "ldr q26, [x28, #0x50]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "fmla v16.4s, v29.4s, v31.s[0]\n"
       "add x28, x28, #0x60\n"
-      "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
+      "fmla v22.4s, v29.4s, v30.s[0]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v17.4s, v28.4s, v31.s[0]\n"
+      "fmla v23.4s, v28.4s, v30.s[0]\n"
+      "fmla v12.4s, v27.4s, v0.s[0]\n"
+      "fmla v18.4s, v27.4s, v31.s[0]\n"
+      "fmla v24.4s, v27.4s, v30.s[0]\n"
+      "fmla v13.4s, v26.4s, v0.s[0]\n"
+      "fmla v19.4s, v26.4s, v31.s[0]\n"
+      "fmla v25.4s, v26.4s, v30.s[0]\n"
       "cbnz x25, 107b\n"
       "108:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1523,45 +1522,45 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 109f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v27.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmin v24.4s, v24.4s, v1.4s\n"
-      "fmin v25.4s, v25.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
-      "fmax v24.4s, v24.4s, v0.4s\n"
-      "fmax v25.4s, v25.4s, v0.4s\n"
+      "ld1r { v26.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v27.4s\n"
+      "fmin v9.4s, v9.4s, v27.4s\n"
+      "fmin v10.4s, v10.4s, v27.4s\n"
+      "fmin v11.4s, v11.4s, v27.4s\n"
+      "fmin v12.4s, v12.4s, v27.4s\n"
+      "fmin v13.4s, v13.4s, v27.4s\n"
+      "fmin v14.4s, v14.4s, v27.4s\n"
+      "fmin v15.4s, v15.4s, v27.4s\n"
+      "fmin v16.4s, v16.4s, v27.4s\n"
+      "fmin v17.4s, v17.4s, v27.4s\n"
+      "fmin v18.4s, v18.4s, v27.4s\n"
+      "fmin v19.4s, v19.4s, v27.4s\n"
+      "fmin v20.4s, v20.4s, v27.4s\n"
+      "fmin v21.4s, v21.4s, v27.4s\n"
+      "fmin v22.4s, v22.4s, v27.4s\n"
+      "fmin v23.4s, v23.4s, v27.4s\n"
+      "fmin v24.4s, v24.4s, v27.4s\n"
+      "fmin v25.4s, v25.4s, v27.4s\n"
+      "fmax v8.4s, v8.4s, v26.4s\n"
+      "fmax v9.4s, v9.4s, v26.4s\n"
+      "fmax v10.4s, v10.4s, v26.4s\n"
+      "fmax v11.4s, v11.4s, v26.4s\n"
+      "fmax v12.4s, v12.4s, v26.4s\n"
+      "fmax v13.4s, v13.4s, v26.4s\n"
+      "fmax v14.4s, v14.4s, v26.4s\n"
+      "fmax v15.4s, v15.4s, v26.4s\n"
+      "fmax v16.4s, v16.4s, v26.4s\n"
+      "fmax v17.4s, v17.4s, v26.4s\n"
+      "fmax v18.4s, v18.4s, v26.4s\n"
+      "fmax v19.4s, v19.4s, v26.4s\n"
+      "fmax v20.4s, v20.4s, v26.4s\n"
+      "fmax v21.4s, v21.4s, v26.4s\n"
+      "fmax v22.4s, v22.4s, v26.4s\n"
+      "fmax v23.4s, v23.4s, v26.4s\n"
+      "fmax v24.4s, v24.4s, v26.4s\n"
+      "fmax v25.4s, v25.4s, v26.4s\n"
       "109:"  // Height 3: No activation
       "cmp x9, #0x18\n"
       "bge 122f\n"
@@ -1983,14 +1982,14 @@
       "142:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 143f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 144f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -2000,9 +1999,9 @@
       "b 144f\n"
       "143:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "144:"  // Height 4: input setup done
       "cmp x25, #0x4\n"
       "blt 147f\n"
@@ -2283,42 +2282,42 @@
       "147:"  // Height 4: Multiply loop: Main loop skip
       "cbz x25, 149f\n"
       "148:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x24], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
+      "ldr s7, [x24], #0x4\n"
+      "ldr s6, [x23], #0x4\n"
       "sub x25, x25, #0x1\n"
-      "ldr s2, [x22], #0x4\n"
-      "ldr s3, [x21], #0x4\n"
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
-      "fmla v8.4s, v4.4s, v0.s[0]\n"
-      "fmla v14.4s, v4.4s, v1.s[0]\n"
-      "ldr q6, [x28, #0x20]\n"
-      "ldr q7, [x28, #0x30]\n"
-      "fmla v20.4s, v4.4s, v2.s[0]\n"
-      "fmla v26.4s, v4.4s, v3.s[0]\n"
-      "ldr q4, [x28, #0x40]\n"
-      "fmla v9.4s, v5.4s, v0.s[0]\n"
-      "fmla v15.4s, v5.4s, v1.s[0]\n"
-      "fmla v21.4s, v5.4s, v2.s[0]\n"
-      "fmla v27.4s, v5.4s, v3.s[0]\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr s5, [x22], #0x4\n"
+      "ldr s4, [x21], #0x4\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q0, [x28, #0x10]\n"
+      "fmla v8.4s, v1.4s, v7.s[0]\n"
+      "fmla v14.4s, v1.4s, v6.s[0]\n"
+      "ldr q3, [x28, #0x20]\n"
+      "ldr q2, [x28, #0x30]\n"
+      "fmla v20.4s, v1.4s, v5.s[0]\n"
+      "fmla v26.4s, v1.4s, v4.s[0]\n"
+      "ldr q1, [x28, #0x40]\n"
+      "fmla v9.4s, v0.4s, v7.s[0]\n"
+      "fmla v15.4s, v0.4s, v6.s[0]\n"
+      "fmla v21.4s, v0.4s, v5.s[0]\n"
+      "fmla v27.4s, v0.4s, v4.s[0]\n"
+      "ldr q0, [x28, #0x50]\n"
       "add x28, x28, #0x60\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v16.4s, v6.4s, v1.s[0]\n"
-      "fmla v22.4s, v6.4s, v2.s[0]\n"
-      "fmla v28.4s, v6.4s, v3.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v17.4s, v7.4s, v1.s[0]\n"
-      "fmla v23.4s, v7.4s, v2.s[0]\n"
-      "fmla v29.4s, v7.4s, v3.s[0]\n"
-      "fmla v12.4s, v4.4s, v0.s[0]\n"
-      "fmla v18.4s, v4.4s, v1.s[0]\n"
-      "fmla v24.4s, v4.4s, v2.s[0]\n"
-      "fmla v30.4s, v4.4s, v3.s[0]\n"
-      "fmla v13.4s, v5.4s, v0.s[0]\n"
-      "fmla v19.4s, v5.4s, v1.s[0]\n"
-      "fmla v25.4s, v5.4s, v2.s[0]\n"
-      "fmla v31.4s, v5.4s, v3.s[0]\n"
+      "fmla v10.4s, v3.4s, v7.s[0]\n"
+      "fmla v16.4s, v3.4s, v6.s[0]\n"
+      "fmla v22.4s, v3.4s, v5.s[0]\n"
+      "fmla v28.4s, v3.4s, v4.s[0]\n"
+      "fmla v11.4s, v2.4s, v7.s[0]\n"
+      "fmla v17.4s, v2.4s, v6.s[0]\n"
+      "fmla v23.4s, v2.4s, v5.s[0]\n"
+      "fmla v29.4s, v2.4s, v4.s[0]\n"
+      "fmla v12.4s, v1.4s, v7.s[0]\n"
+      "fmla v18.4s, v1.4s, v6.s[0]\n"
+      "fmla v24.4s, v1.4s, v5.s[0]\n"
+      "fmla v30.4s, v1.4s, v4.s[0]\n"
+      "fmla v13.4s, v0.4s, v7.s[0]\n"
+      "fmla v19.4s, v0.4s, v6.s[0]\n"
+      "fmla v25.4s, v0.4s, v5.s[0]\n"
+      "fmla v31.4s, v0.4s, v4.s[0]\n"
       "cbnz x25, 148b\n"
       "149:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2584,7 +2583,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "166:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 4cfa18b..759729d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -113,5 +113,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
index 985d57d..ddbc840 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
@@ -92,7 +92,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 166f\n"
@@ -189,11 +188,11 @@
       "15:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
       "cbnz x15, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #2\n"
@@ -210,126 +209,126 @@
       "blt 19f\n"
       "18:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr d6, [x17, #0x20]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr d17, [x17, #0x20]\n"
+      "ldr x20, [x17, #0x28]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x38]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "ldr x12, [x17, #0x48]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x78]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "mov v7.d[1], x11\n"
+      "ldr d16, [x17, #0x30]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x38]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "ldr d17, [x17, #0x40]\n"
+      "ldr x20, [x17, #0x48]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr d16, [x17, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr d17, [x17, #0x60]\n"
+      "ldr x20, [x17, #0x68]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr d16, [x17, #0x70]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x78]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "ldr d17, [x17, #0x80]\n"
+      "ldr x20, [x17, #0x88]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr d16, [x17, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr d17, [x17, #0xa0]\n"
+      "ldr x20, [x17, #0xa8]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr d16, [x17, #0xb0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr d17, [x17, #0xc0]\n"
+      "ldr x20, [x17, #0xc8]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr d16, [x17, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr d17, [x17, #0xe0]\n"
+      "ldr x20, [x17, #0xe8]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr d16, [x17, #0xf0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "mov v16.d[1], x20\n"
       "add x13, x13, #0x10\n"
       "add x17, x17, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
       "ldr d6, [x17, #0x0]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "ldr x20, [x17, #0x8]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "ldr d0, [x13, #0x0]\n"
       "sub x14, x14, #0x4\n"
       "ldr d7, [x17, #0x10]\n"
       "cmp x14, #0x8\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x18]\n"
-      "mov v0.d[1], x10\n"
-      "mov v7.d[1], x11\n"
+      "ldr x21, [x13, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x17, #0x18]\n"
+      "mov v0.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "bge 18b\n"
       "19:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q17, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "ldr q17, [x17, #0x40]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr q16, [x17, #0x50]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x17, #0x60]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x17, #0x70]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x17, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x17, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x17, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x17, #0xb0]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x17, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x17, #0xd0]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x17, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x17, #0xf0]\n"
       "add x13, x13, #0x10\n"
       "sub x14, x14, #0x4\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "add x17, x17, #0x100\n"
       "20:"  // Height 1: Multiply loop: Main loop skip
       "cbz x14, 22f\n"
       "21:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s17, [x13], #0x4\n"
       "sub x14, x14, #0x1\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "ldr q16, [x17, #0x0]\n"
+      "fmla v8.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x17, #0x10]\n"
+      "fmla v9.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x17, #0x20]\n"
+      "fmla v10.4s, v16.4s, v17.s[0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v11.4s, v16.4s, v17.s[0]\n"
       "add x17, x17, #0x40\n"
       "cbnz x14, 21b\n"
       "22:"  // Height 1: Multiply loop: No odd multiplies
@@ -340,17 +339,17 @@
       "prfm pstl1keep, [x16, #0x0]\n"
       "tbz %x[flags], #1, 23f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v16.4s\n"
+      "fmin v9.4s, v9.4s, v16.4s\n"
+      "fmin v10.4s, v10.4s, v16.4s\n"
+      "fmin v11.4s, v11.4s, v16.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
       "23:"  // Height 1: No activation
       "cmp x8, #0x10\n"
       "bge 32f\n"
@@ -528,196 +527,196 @@
       "48:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 49f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
       "cbnz x15, 50f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #2\n"
-      "add x9, x9, x20, LSL #2\n"
+      "add x12, x12, x20, LSL #2\n"
       "b 50f\n"
       "49:"  // Height 2: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #2\n"
+      "add x12, x13, x21, LSL #2\n"
       "50:"  // Height 2: input setup done
       "cmp x14, #0x4\n"
       "blt 53f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x8\n"
-      "ldr q1, [x9, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 52f\n"
       "51:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d17, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr x12, [x17, #0x48]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v6.d[1], x12\n"
+      "ldr d16, [x17, #0x30]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.4s, v17.4s, v1.s[0]\n"
+      "ldr d17, [x17, #0x40]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr x20, [x17, #0x48]\n"
+      "fmla v15.4s, v16.4s, v1.s[0]\n"
+      "ldr d16, [x17, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v12.4s, v17.4s, v1.s[1]\n"
+      "ldr d17, [x17, #0x60]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v13.4s, v16.4s, v1.s[1]\n"
+      "ldr d16, [x17, #0x70]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.4s, v17.4s, v1.s[1]\n"
+      "ldr d17, [x17, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr x20, [x17, #0x88]\n"
+      "fmla v15.4s, v16.4s, v1.s[1]\n"
+      "ldr d16, [x17, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v12.4s, v17.4s, v1.s[2]\n"
+      "ldr d17, [x17, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v13.4s, v16.4s, v1.s[2]\n"
+      "ldr d16, [x17, #0xb0]\n"
+      "mov v17.d[1], x21\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v14.4s, v17.4s, v1.s[2]\n"
+      "ldr d17, [x17, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr x20, [x17, #0xc8]\n"
+      "fmla v15.4s, v16.4s, v1.s[2]\n"
+      "ldr d16, [x17, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v12.4s, v17.4s, v1.s[3]\n"
+      "ldr d17, [x17, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v13.4s, v16.4s, v1.s[3]\n"
+      "ldr d16, [x17, #0xf0]\n"
+      "mov v17.d[1], x21\n"
       "add x13, x13, #0x10\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
+      "mov v16.d[1], x20\n"
+      "add x12, x12, #0x10\n"
       "add x17, x17, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v14.4s, v17.4s, v1.s[3]\n"
       "ldr d6, [x17, #0x0]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "ldr x21, [x17, #0x8]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "fmla v15.4s, v16.4s, v1.s[3]\n"
+      "ldr d1, [x12, #0x0]\n"
       "sub x14, x14, #0x4\n"
       "ldr d7, [x17, #0x10]\n"
       "cmp x14, #0x8\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x28, [x9, #0x8]\n"
-      "mov v0.d[1], x10\n"
-      "ldr x11, [x17, #0x18]\n"
-      "mov v1.d[1], x28\n"
+      "ldr x20, [x13, #0x8]\n"
+      "mov v6.d[1], x21\n"
+      "ldr x21, [x12, #0x8]\n"
+      "mov v0.d[1], x20\n"
+      "ldr x20, [x17, #0x18]\n"
+      "mov v1.d[1], x21\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v7.d[1], x11\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "mov v7.d[1], x20\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "bge 51b\n"
       "52:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q17, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
       "sub x14, x14, #0x4\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v14.4s, v17.4s, v1.s[0]\n"
+      "ldr q17, [x17, #0x40]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v15.4s, v16.4s, v1.s[0]\n"
+      "ldr q16, [x17, #0x50]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v12.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x17, #0x60]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "fmla v13.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x17, #0x70]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "fmla v14.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x17, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "fmla v15.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x17, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "fmla v12.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x17, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "fmla v13.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x17, #0xb0]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "fmla v14.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x17, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "fmla v15.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x17, #0xd0]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v1.s[3]\n"
+      "ldr q17, [x17, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v1.s[3]\n"
+      "ldr q16, [x17, #0xf0]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
       "add x17, x17, #0x100\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v14.4s, v17.4s, v1.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
+      "fmla v15.4s, v16.4s, v1.s[3]\n"
       "53:"  // Height 2: Multiply loop: Main loop skip
       "cbz x14, 55f\n"
       "54:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s19, [x13], #0x4\n"
       "sub x14, x14, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s18, [x12], #0x4\n"
+      "ldr q17, [x17, #0x0]\n"
+      "fmla v8.4s, v17.4s, v19.s[0]\n"
+      "ldr q16, [x17, #0x10]\n"
+      "fmla v12.4s, v17.4s, v18.s[0]\n"
+      "ldr q17, [x17, #0x20]\n"
+      "fmla v9.4s, v16.4s, v19.s[0]\n"
+      "fmla v13.4s, v16.4s, v18.s[0]\n"
+      "ldr q16, [x17, #0x30]\n"
+      "fmla v10.4s, v17.4s, v19.s[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v14.4s, v17.4s, v18.s[0]\n"
+      "fmla v11.4s, v16.4s, v19.s[0]\n"
+      "fmla v15.4s, v16.4s, v18.s[0]\n"
       "cbnz x14, 54b\n"
       "55:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -730,25 +729,25 @@
       "prfm pstl1keep, [x25, #0x0]\n"
       "tbz %x[flags], #1, 56f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
-      "fmin v14.4s, v14.4s, v0.4s\n"
-      "fmin v15.4s, v15.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v16.4s\n"
+      "fmin v9.4s, v9.4s, v16.4s\n"
+      "fmin v10.4s, v10.4s, v16.4s\n"
+      "fmin v11.4s, v11.4s, v16.4s\n"
+      "fmin v12.4s, v12.4s, v16.4s\n"
+      "fmin v13.4s, v13.4s, v16.4s\n"
+      "fmin v14.4s, v14.4s, v16.4s\n"
+      "fmin v15.4s, v15.4s, v16.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
+      "fmax v12.4s, v12.4s, v16.4s\n"
+      "fmax v13.4s, v13.4s, v16.4s\n"
+      "fmax v14.4s, v14.4s, v16.4s\n"
+      "fmax v15.4s, v15.4s, v16.4s\n"
       "56:"  // Height 2: No activation
       "cmp x8, #0x10\n"
       "bge 65f\n"
@@ -975,244 +974,244 @@
       "81:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 82f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
       "cbnz x15, 83f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #2\n"
-      "add x9, x9, x20, LSL #2\n"
-      "add x27, x27, x20, LSL #2\n"
+      "add x12, x12, x20, LSL #2\n"
+      "add x11, x11, x20, LSL #2\n"
       "b 83f\n"
       "82:"  // Height 3: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #2\n"
-      "add x27, x9, x20, LSL #2\n"
+      "add x12, x13, x21, LSL #2\n"
+      "add x11, x12, x21, LSL #2\n"
       "83:"  // Height 3: input setup done
       "cmp x14, #0x4\n"
       "blt 86f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x8\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 85f\n"
       "84:"  // Height 3: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d21, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v21.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x58]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x98]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "ldr d20, [x17, #0x30]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.4s, v21.4s, v0.s[0]\n"
+      "fmla v14.4s, v21.4s, v1.s[0]\n"
+      "ldr x20, [x17, #0x58]\n"
+      "fmla v18.4s, v21.4s, v2.s[0]\n"
+      "ldr d21, [x17, #0x40]\n"
+      "fmla v11.4s, v20.4s, v0.s[0]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.4s, v20.4s, v1.s[0]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v19.4s, v20.4s, v2.s[0]\n"
+      "ldr d20, [x17, #0x50]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v12.4s, v21.4s, v1.s[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v16.4s, v21.4s, v2.s[1]\n"
+      "ldr d21, [x17, #0x60]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.4s, v20.4s, v1.s[1]\n"
+      "ldr x21, [x17, #0x88]\n"
+      "fmla v17.4s, v20.4s, v2.s[1]\n"
+      "ldr d20, [x17, #0x70]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "ldr x20, [x17, #0x98]\n"
+      "fmla v18.4s, v21.4s, v2.s[1]\n"
+      "ldr d21, [x17, #0x80]\n"
+      "fmla v11.4s, v20.4s, v0.s[1]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v19.4s, v20.4s, v2.s[1]\n"
+      "ldr d20, [x17, #0x90]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.4s, v21.4s, v0.s[2]\n"
+      "fmla v12.4s, v21.4s, v1.s[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v16.4s, v21.4s, v2.s[2]\n"
+      "ldr d21, [x17, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[2]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.4s, v20.4s, v1.s[2]\n"
+      "ldr x21, [x17, #0xc8]\n"
+      "fmla v17.4s, v20.4s, v2.s[2]\n"
+      "ldr d20, [x17, #0xb0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v14.4s, v21.4s, v1.s[2]\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "fmla v18.4s, v21.4s, v2.s[2]\n"
+      "ldr d21, [x17, #0xc0]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v15.4s, v20.4s, v1.s[2]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v19.4s, v20.4s, v2.s[2]\n"
+      "ldr d20, [x17, #0xd0]\n"
+      "mov v20.d[1], x20\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v12.4s, v21.4s, v1.s[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v16.4s, v21.4s, v2.s[3]\n"
+      "ldr d21, [x17, #0xe0]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "mov v21.d[1], x21\n"
+      "fmla v13.4s, v20.4s, v1.s[3]\n"
       "add x13, x13, #0x10\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
-      "add x27, x27, #0x10\n"
+      "fmla v17.4s, v20.4s, v2.s[3]\n"
+      "ldr d20, [x17, #0xf0]\n"
+      "mov v20.d[1], x20\n"
+      "add x12, x12, #0x10\n"
+      "add x11, x11, #0x10\n"
       "add x17, x17, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr x10, [x13, #0x8]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v10.4s, v21.4s, v0.s[3]\n"
+      "ldr x20, [x17, #0x8]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "ldr x23, [x13, #0x8]\n"
+      "fmla v18.4s, v21.4s, v2.s[3]\n"
       "ldr d6, [x17, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v20.4s, v0.s[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "ldr x22, [x12, #0x8]\n"
+      "fmla v19.4s, v20.4s, v2.s[3]\n"
+      "ldr d2, [x11, #0x0]\n"
       "sub x14, x14, #0x4\n"
       "ldr d7, [x17, #0x10]\n"
       "cmp x14, #0x8\n"
-      "ldr x26, [x27, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x17, #0x18]\n"
-      "mov v0.d[1], x10\n"
+      "ldr x21, [x11, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x17, #0x18]\n"
+      "mov v0.d[1], x23\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v1.d[1], x28\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "mov v2.d[1], x26\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "mov v7.d[1], x11\n"
+      "mov v1.d[1], x22\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "mov v2.d[1], x21\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "mov v7.d[1], x20\n"
       "bge 84b\n"
       "85:"  // Height 3: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q21, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "sub x14, x14, #0x4\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr q20, [x17, #0x30]\n"
+      "fmla v10.4s, v21.4s, v0.s[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v18.4s, v21.4s, v2.s[0]\n"
+      "ldr q21, [x17, #0x40]\n"
+      "fmla v11.4s, v20.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v15.4s, v20.4s, v1.s[0]\n"
+      "fmla v19.4s, v20.4s, v2.s[0]\n"
+      "ldr q20, [x17, #0x50]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v12.4s, v21.4s, v1.s[1]\n"
+      "fmla v16.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x17, #0x60]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v13.4s, v20.4s, v1.s[1]\n"
+      "fmla v17.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x17, #0x70]\n"
+      "fmla v10.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "fmla v18.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x17, #0x80]\n"
+      "fmla v11.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "fmla v19.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x17, #0x90]\n"
+      "fmla v8.4s, v21.4s, v0.s[2]\n"
+      "fmla v12.4s, v21.4s, v1.s[2]\n"
+      "fmla v16.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x17, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[2]\n"
+      "fmla v13.4s, v20.4s, v1.s[2]\n"
+      "fmla v17.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x17, #0xb0]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v14.4s, v21.4s, v1.s[2]\n"
+      "fmla v18.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x17, #0xc0]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "fmla v15.4s, v20.4s, v1.s[2]\n"
+      "fmla v19.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x17, #0xd0]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v12.4s, v21.4s, v1.s[3]\n"
+      "fmla v16.4s, v21.4s, v2.s[3]\n"
+      "ldr q21, [x17, #0xe0]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "fmla v13.4s, v20.4s, v1.s[3]\n"
+      "fmla v17.4s, v20.4s, v2.s[3]\n"
+      "ldr q20, [x17, #0xf0]\n"
+      "fmla v10.4s, v21.4s, v0.s[3]\n"
       "add x17, x17, #0x100\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "fmla v18.4s, v21.4s, v2.s[3]\n"
+      "fmla v11.4s, v20.4s, v0.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "fmla v19.4s, v20.4s, v2.s[3]\n"
       "86:"  // Height 3: Multiply loop: Main loop skip
       "cbz x14, 88f\n"
       "87:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s24, [x13], #0x4\n"
       "sub x14, x14, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s23, [x12], #0x4\n"
+      "ldr s22, [x11], #0x4\n"
+      "ldr q21, [x17, #0x0]\n"
+      "fmla v8.4s, v21.4s, v24.s[0]\n"
+      "ldr q20, [x17, #0x10]\n"
+      "fmla v12.4s, v21.4s, v23.s[0]\n"
+      "fmla v16.4s, v21.4s, v22.s[0]\n"
+      "ldr q21, [x17, #0x20]\n"
+      "fmla v9.4s, v20.4s, v24.s[0]\n"
+      "fmla v13.4s, v20.4s, v23.s[0]\n"
+      "fmla v17.4s, v20.4s, v22.s[0]\n"
+      "ldr q20, [x17, #0x30]\n"
+      "fmla v10.4s, v21.4s, v24.s[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v14.4s, v21.4s, v23.s[0]\n"
+      "fmla v18.4s, v21.4s, v22.s[0]\n"
+      "fmla v11.4s, v20.4s, v24.s[0]\n"
+      "fmla v15.4s, v20.4s, v23.s[0]\n"
+      "fmla v19.4s, v20.4s, v22.s[0]\n"
       "cbnz x14, 87b\n"
       "88:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1227,33 +1226,33 @@
       "prfm pstl1keep, [x24, #0x0]\n"
       "tbz %x[flags], #1, 89f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
-      "fmin v14.4s, v14.4s, v0.4s\n"
-      "fmin v15.4s, v15.4s, v0.4s\n"
-      "fmin v16.4s, v16.4s, v0.4s\n"
-      "fmin v17.4s, v17.4s, v0.4s\n"
-      "fmin v18.4s, v18.4s, v0.4s\n"
-      "fmin v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v20.4s\n"
+      "fmin v9.4s, v9.4s, v20.4s\n"
+      "fmin v10.4s, v10.4s, v20.4s\n"
+      "fmin v11.4s, v11.4s, v20.4s\n"
+      "fmin v12.4s, v12.4s, v20.4s\n"
+      "fmin v13.4s, v13.4s, v20.4s\n"
+      "fmin v14.4s, v14.4s, v20.4s\n"
+      "fmin v15.4s, v15.4s, v20.4s\n"
+      "fmin v16.4s, v16.4s, v20.4s\n"
+      "fmin v17.4s, v17.4s, v20.4s\n"
+      "fmin v18.4s, v18.4s, v20.4s\n"
+      "fmin v19.4s, v19.4s, v20.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v20.4s\n"
+      "fmax v9.4s, v9.4s, v20.4s\n"
+      "fmax v10.4s, v10.4s, v20.4s\n"
+      "fmax v11.4s, v11.4s, v20.4s\n"
+      "fmax v12.4s, v12.4s, v20.4s\n"
+      "fmax v13.4s, v13.4s, v20.4s\n"
+      "fmax v14.4s, v14.4s, v20.4s\n"
+      "fmax v15.4s, v15.4s, v20.4s\n"
+      "fmax v16.4s, v16.4s, v20.4s\n"
+      "fmax v17.4s, v17.4s, v20.4s\n"
+      "fmax v18.4s, v18.4s, v20.4s\n"
+      "fmax v19.4s, v19.4s, v20.4s\n"
       "89:"  // Height 3: No activation
       "cmp x8, #0x10\n"
       "bge 98f\n"
@@ -1529,292 +1528,292 @@
       "114:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 115f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
       "cbnz x15, 116f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #2\n"
-      "add x9, x9, x20, LSL #2\n"
-      "add x27, x27, x20, LSL #2\n"
-      "add x25, x25, x20, LSL #2\n"
+      "add x12, x12, x20, LSL #2\n"
+      "add x11, x11, x20, LSL #2\n"
+      "add x10, x10, x20, LSL #2\n"
       "b 116f\n"
       "115:"  // Height 4: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #2\n"
-      "add x27, x9, x20, LSL #2\n"
-      "add x25, x27, x20, LSL #2\n"
+      "add x12, x13, x21, LSL #2\n"
+      "add x11, x12, x21, LSL #2\n"
+      "add x10, x11, x21, LSL #2\n"
       "116:"  // Height 4: input setup done
       "cmp x14, #0x4\n"
       "blt 119f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x8\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 118f\n"
       "117:"  // Height 4: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d25, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v25.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x58]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "add x27, x27, #0x10\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "add x25, x25, #0x10\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr x10, [x13, #0x8]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x98]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "ldr d24, [x17, #0x30]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.4s, v25.4s, v0.s[0]\n"
+      "fmla v14.4s, v25.4s, v1.s[0]\n"
+      "ldr x20, [x17, #0x58]\n"
+      "fmla v18.4s, v25.4s, v2.s[0]\n"
+      "add x11, x11, #0x10\n"
+      "fmla v22.4s, v25.4s, v3.s[0]\n"
+      "ldr d25, [x17, #0x40]\n"
+      "fmla v11.4s, v24.4s, v0.s[0]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.4s, v24.4s, v1.s[0]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v19.4s, v24.4s, v2.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v23.4s, v24.4s, v3.s[0]\n"
+      "ldr d24, [x17, #0x50]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.4s, v25.4s, v0.s[1]\n"
+      "fmla v12.4s, v25.4s, v1.s[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v16.4s, v25.4s, v2.s[1]\n"
+      "ldr x25, [x13, #0x8]\n"
+      "fmla v20.4s, v25.4s, v3.s[1]\n"
+      "ldr d25, [x17, #0x60]\n"
+      "fmla v9.4s, v24.4s, v0.s[1]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.4s, v24.4s, v1.s[1]\n"
+      "ldr x21, [x17, #0x88]\n"
+      "fmla v17.4s, v24.4s, v2.s[1]\n"
+      "ldr x24, [x12, #0x8]\n"
+      "fmla v21.4s, v24.4s, v3.s[1]\n"
+      "ldr d24, [x17, #0x70]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.4s, v25.4s, v0.s[1]\n"
+      "fmla v14.4s, v25.4s, v1.s[1]\n"
+      "ldr x20, [x17, #0x98]\n"
+      "fmla v18.4s, v25.4s, v2.s[1]\n"
+      "ldr x23, [x11, #0x8]\n"
+      "fmla v22.4s, v25.4s, v3.s[1]\n"
+      "ldr d25, [x17, #0x80]\n"
+      "fmla v11.4s, v24.4s, v0.s[1]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.4s, v24.4s, v1.s[1]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v19.4s, v24.4s, v2.s[1]\n"
+      "ldr x22, [x10, #0x8]\n"
+      "fmla v23.4s, v24.4s, v3.s[1]\n"
+      "ldr d24, [x17, #0x90]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.4s, v25.4s, v0.s[2]\n"
+      "fmla v12.4s, v25.4s, v1.s[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v16.4s, v25.4s, v2.s[2]\n"
       "sub x14, x14, #0x4\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v20.4s, v25.4s, v3.s[2]\n"
+      "ldr d25, [x17, #0xa0]\n"
+      "fmla v9.4s, v24.4s, v0.s[2]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.4s, v24.4s, v1.s[2]\n"
+      "ldr x21, [x17, #0xc8]\n"
+      "fmla v17.4s, v24.4s, v2.s[2]\n"
       "cmp x14, #0x8\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v21.4s, v24.4s, v3.s[2]\n"
+      "ldr d24, [x17, #0xb0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v10.4s, v25.4s, v0.s[2]\n"
+      "fmla v14.4s, v25.4s, v1.s[2]\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "fmla v18.4s, v25.4s, v2.s[2]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "fmla v22.4s, v25.4s, v3.s[2]\n"
+      "ldr d25, [x17, #0xc0]\n"
+      "fmla v11.4s, v24.4s, v0.s[2]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v15.4s, v24.4s, v1.s[2]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v19.4s, v24.4s, v2.s[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v23.4s, v24.4s, v3.s[2]\n"
+      "ldr d24, [x17, #0xd0]\n"
+      "mov v24.d[1], x20\n"
+      "fmla v8.4s, v25.4s, v0.s[3]\n"
+      "fmla v12.4s, v25.4s, v1.s[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v16.4s, v25.4s, v2.s[3]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v20.4s, v25.4s, v3.s[3]\n"
+      "ldr d25, [x17, #0xe0]\n"
+      "fmla v9.4s, v24.4s, v0.s[3]\n"
+      "mov v25.d[1], x21\n"
+      "fmla v13.4s, v24.4s, v1.s[3]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v17.4s, v24.4s, v2.s[3]\n"
+      "fmla v21.4s, v24.4s, v3.s[3]\n"
+      "ldr d24, [x17, #0xf0]\n"
+      "mov v24.d[1], x20\n"
       "add x17, x17, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0x18]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v10.4s, v25.4s, v0.s[3]\n"
+      "ldr x21, [x17, #0x8]\n"
+      "fmla v14.4s, v25.4s, v1.s[3]\n"
+      "ldr x20, [x17, #0x18]\n"
+      "fmla v18.4s, v25.4s, v2.s[3]\n"
+      "fmla v22.4s, v25.4s, v3.s[3]\n"
       "ldr d6, [x17, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v24.4s, v0.s[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "fmla v15.4s, v24.4s, v1.s[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "fmla v19.4s, v24.4s, v2.s[3]\n"
+      "ldr d2, [x11, #0x0]\n"
+      "fmla v23.4s, v24.4s, v3.s[3]\n"
+      "ldr d3, [x10, #0x0]\n"
       "ldr d7, [x17, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
-      "mov v7.d[1], x11\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x25\n"
+      "mov v1.d[1], x24\n"
+      "mov v2.d[1], x23\n"
+      "mov v3.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 117b\n"
       "118:"  // Height 4: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q25, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "sub x14, x14, #0x4\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "ldr q24, [x17, #0x30]\n"
+      "fmla v10.4s, v25.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v14.4s, v25.4s, v1.s[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v18.4s, v25.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v22.4s, v25.4s, v3.s[0]\n"
+      "ldr q25, [x17, #0x40]\n"
+      "fmla v11.4s, v24.4s, v0.s[0]\n"
+      "fmla v15.4s, v24.4s, v1.s[0]\n"
+      "fmla v19.4s, v24.4s, v2.s[0]\n"
+      "fmla v23.4s, v24.4s, v3.s[0]\n"
+      "ldr q24, [x17, #0x50]\n"
+      "fmla v8.4s, v25.4s, v0.s[1]\n"
+      "fmla v12.4s, v25.4s, v1.s[1]\n"
+      "fmla v16.4s, v25.4s, v2.s[1]\n"
+      "fmla v20.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x17, #0x60]\n"
+      "fmla v9.4s, v24.4s, v0.s[1]\n"
+      "fmla v13.4s, v24.4s, v1.s[1]\n"
+      "fmla v17.4s, v24.4s, v2.s[1]\n"
+      "fmla v21.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x17, #0x70]\n"
+      "fmla v10.4s, v25.4s, v0.s[1]\n"
+      "fmla v14.4s, v25.4s, v1.s[1]\n"
+      "fmla v18.4s, v25.4s, v2.s[1]\n"
+      "fmla v22.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x17, #0x80]\n"
+      "fmla v11.4s, v24.4s, v0.s[1]\n"
+      "fmla v15.4s, v24.4s, v1.s[1]\n"
+      "fmla v19.4s, v24.4s, v2.s[1]\n"
+      "fmla v23.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x17, #0x90]\n"
+      "fmla v8.4s, v25.4s, v0.s[2]\n"
+      "fmla v12.4s, v25.4s, v1.s[2]\n"
+      "fmla v16.4s, v25.4s, v2.s[2]\n"
+      "fmla v20.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x17, #0xa0]\n"
+      "fmla v9.4s, v24.4s, v0.s[2]\n"
+      "fmla v13.4s, v24.4s, v1.s[2]\n"
+      "fmla v17.4s, v24.4s, v2.s[2]\n"
+      "fmla v21.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x17, #0xb0]\n"
+      "fmla v10.4s, v25.4s, v0.s[2]\n"
+      "fmla v14.4s, v25.4s, v1.s[2]\n"
+      "fmla v18.4s, v25.4s, v2.s[2]\n"
+      "fmla v22.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x17, #0xc0]\n"
+      "fmla v11.4s, v24.4s, v0.s[2]\n"
+      "fmla v15.4s, v24.4s, v1.s[2]\n"
+      "fmla v19.4s, v24.4s, v2.s[2]\n"
+      "fmla v23.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x17, #0xd0]\n"
+      "fmla v8.4s, v25.4s, v0.s[3]\n"
+      "fmla v12.4s, v25.4s, v1.s[3]\n"
+      "fmla v16.4s, v25.4s, v2.s[3]\n"
+      "fmla v20.4s, v25.4s, v3.s[3]\n"
+      "ldr q25, [x17, #0xe0]\n"
+      "fmla v9.4s, v24.4s, v0.s[3]\n"
+      "fmla v13.4s, v24.4s, v1.s[3]\n"
+      "fmla v17.4s, v24.4s, v2.s[3]\n"
+      "fmla v21.4s, v24.4s, v3.s[3]\n"
+      "ldr q24, [x17, #0xf0]\n"
+      "fmla v10.4s, v25.4s, v0.s[3]\n"
       "add x17, x17, #0x100\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v14.4s, v25.4s, v1.s[3]\n"
+      "fmla v18.4s, v25.4s, v2.s[3]\n"
+      "fmla v22.4s, v25.4s, v3.s[3]\n"
+      "fmla v11.4s, v24.4s, v0.s[3]\n"
+      "fmla v15.4s, v24.4s, v1.s[3]\n"
+      "fmla v19.4s, v24.4s, v2.s[3]\n"
+      "fmla v23.4s, v24.4s, v3.s[3]\n"
       "119:"  // Height 4: Multiply loop: Main loop skip
       "cbz x14, 121f\n"
       "120:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s29, [x13], #0x4\n"
       "sub x14, x14, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s28, [x12], #0x4\n"
+      "ldr s27, [x11], #0x4\n"
+      "ldr s26, [x10], #0x4\n"
+      "ldr q25, [x17, #0x0]\n"
+      "fmla v8.4s, v25.4s, v29.s[0]\n"
+      "ldr q24, [x17, #0x10]\n"
+      "fmla v12.4s, v25.4s, v28.s[0]\n"
+      "fmla v16.4s, v25.4s, v27.s[0]\n"
+      "fmla v20.4s, v25.4s, v26.s[0]\n"
+      "ldr q25, [x17, #0x20]\n"
+      "fmla v9.4s, v24.4s, v29.s[0]\n"
+      "fmla v13.4s, v24.4s, v28.s[0]\n"
+      "fmla v17.4s, v24.4s, v27.s[0]\n"
+      "fmla v21.4s, v24.4s, v26.s[0]\n"
+      "ldr q24, [x17, #0x30]\n"
+      "fmla v10.4s, v25.4s, v29.s[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v14.4s, v25.4s, v28.s[0]\n"
+      "fmla v18.4s, v25.4s, v27.s[0]\n"
+      "fmla v22.4s, v25.4s, v26.s[0]\n"
+      "fmla v11.4s, v24.4s, v29.s[0]\n"
+      "fmla v15.4s, v24.4s, v28.s[0]\n"
+      "fmla v19.4s, v24.4s, v27.s[0]\n"
+      "fmla v23.4s, v24.4s, v26.s[0]\n"
       "cbnz x14, 120b\n"
       "121:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1831,41 +1830,41 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 122f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
-      "fmin v14.4s, v14.4s, v0.4s\n"
-      "fmin v15.4s, v15.4s, v0.4s\n"
-      "fmin v16.4s, v16.4s, v0.4s\n"
-      "fmin v17.4s, v17.4s, v0.4s\n"
-      "fmin v18.4s, v18.4s, v0.4s\n"
-      "fmin v19.4s, v19.4s, v0.4s\n"
-      "fmin v20.4s, v20.4s, v0.4s\n"
-      "fmin v21.4s, v21.4s, v0.4s\n"
-      "fmin v22.4s, v22.4s, v0.4s\n"
-      "fmin v23.4s, v23.4s, v0.4s\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v24.4s\n"
+      "fmin v9.4s, v9.4s, v24.4s\n"
+      "fmin v10.4s, v10.4s, v24.4s\n"
+      "fmin v11.4s, v11.4s, v24.4s\n"
+      "fmin v12.4s, v12.4s, v24.4s\n"
+      "fmin v13.4s, v13.4s, v24.4s\n"
+      "fmin v14.4s, v14.4s, v24.4s\n"
+      "fmin v15.4s, v15.4s, v24.4s\n"
+      "fmin v16.4s, v16.4s, v24.4s\n"
+      "fmin v17.4s, v17.4s, v24.4s\n"
+      "fmin v18.4s, v18.4s, v24.4s\n"
+      "fmin v19.4s, v19.4s, v24.4s\n"
+      "fmin v20.4s, v20.4s, v24.4s\n"
+      "fmin v21.4s, v21.4s, v24.4s\n"
+      "fmin v22.4s, v22.4s, v24.4s\n"
+      "fmin v23.4s, v23.4s, v24.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v24.4s\n"
+      "fmax v9.4s, v9.4s, v24.4s\n"
+      "fmax v10.4s, v10.4s, v24.4s\n"
+      "fmax v11.4s, v11.4s, v24.4s\n"
+      "fmax v12.4s, v12.4s, v24.4s\n"
+      "fmax v13.4s, v13.4s, v24.4s\n"
+      "fmax v14.4s, v14.4s, v24.4s\n"
+      "fmax v15.4s, v15.4s, v24.4s\n"
+      "fmax v16.4s, v16.4s, v24.4s\n"
+      "fmax v17.4s, v17.4s, v24.4s\n"
+      "fmax v18.4s, v18.4s, v24.4s\n"
+      "fmax v19.4s, v19.4s, v24.4s\n"
+      "fmax v20.4s, v20.4s, v24.4s\n"
+      "fmax v21.4s, v21.4s, v24.4s\n"
+      "fmax v22.4s, v22.4s, v24.4s\n"
+      "fmax v23.4s, v23.4s, v24.4s\n"
       "122:"  // Height 4: No activation
       "cmp x8, #0x10\n"
       "bge 131f\n"
@@ -2190,340 +2189,340 @@
       "147:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 148f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
       "cbnz x15, 149f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #2\n"
+      "add x12, x12, x20, LSL #2\n"
+      "add x11, x11, x20, LSL #2\n"
+      "add x10, x10, x20, LSL #2\n"
       "add x9, x9, x20, LSL #2\n"
-      "add x27, x27, x20, LSL #2\n"
-      "add x25, x25, x20, LSL #2\n"
-      "add x23, x23, x20, LSL #2\n"
       "b 149f\n"
       "148:"  // Height 5: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #2\n"
-      "add x27, x9, x20, LSL #2\n"
-      "add x25, x27, x20, LSL #2\n"
-      "add x23, x25, x20, LSL #2\n"
+      "add x12, x13, x21, LSL #2\n"
+      "add x11, x12, x21, LSL #2\n"
+      "add x10, x11, x21, LSL #2\n"
+      "add x9, x10, x21, LSL #2\n"
       "149:"  // Height 5: input setup done
       "cmp x14, #0x4\n"
       "blt 152f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x8\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 151f\n"
       "150:"  // Height 5: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr d6, [x17, #0x20]\n"
+      "ldr d29, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v29.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x58]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "add x23, x23, #0x10\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr x10, [x13, #0x8]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr d6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x68]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x78]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr x22, [x23, #0x8]\n"
-      "fmla v24.4s, v6.4s, v4.s[1]\n"
-      "ldr d6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0x88]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "ldr d28, [x17, #0x30]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "ldr x20, [x17, #0x58]\n"
+      "fmla v18.4s, v29.4s, v2.s[0]\n"
+      "add x9, x9, #0x10\n"
+      "fmla v22.4s, v29.4s, v3.s[0]\n"
+      "ldr x26, [x13, #0x8]\n"
+      "fmla v26.4s, v29.4s, v4.s[0]\n"
+      "ldr d29, [x17, #0x40]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "ldr x21, [x17, #0x68]\n"
+      "fmla v19.4s, v28.4s, v2.s[0]\n"
+      "ldr x25, [x12, #0x8]\n"
+      "fmla v23.4s, v28.4s, v3.s[0]\n"
+      "ldr x24, [x11, #0x8]\n"
+      "fmla v27.4s, v28.4s, v4.s[0]\n"
+      "ldr d28, [x17, #0x50]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.4s, v29.4s, v0.s[1]\n"
+      "fmla v12.4s, v29.4s, v1.s[1]\n"
+      "ldr x20, [x17, #0x78]\n"
+      "fmla v16.4s, v29.4s, v2.s[1]\n"
+      "ldr x23, [x10, #0x8]\n"
+      "fmla v20.4s, v29.4s, v3.s[1]\n"
+      "ldr x22, [x9, #0x8]\n"
+      "fmla v24.4s, v29.4s, v4.s[1]\n"
+      "ldr d29, [x17, #0x60]\n"
+      "fmla v9.4s, v28.4s, v0.s[1]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.4s, v28.4s, v1.s[1]\n"
+      "ldr x21, [x17, #0x88]\n"
+      "fmla v17.4s, v28.4s, v2.s[1]\n"
       "sub x14, x14, #0x4\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "fmla v21.4s, v28.4s, v3.s[1]\n"
       "cmp x14, #0x8\n"
-      "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x98]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v25.4s, v28.4s, v4.s[1]\n"
+      "ldr d28, [x17, #0x70]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v14.4s, v29.4s, v1.s[1]\n"
+      "ldr x20, [x17, #0x98]\n"
+      "fmla v18.4s, v29.4s, v2.s[1]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "fmla v22.4s, v29.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v26.4s, v29.4s, v4.s[1]\n"
+      "ldr d29, [x17, #0x80]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.4s, v28.4s, v1.s[1]\n"
+      "ldr x21, [x17, #0xa8]\n"
+      "fmla v19.4s, v28.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v23.4s, v28.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v27.4s, v28.4s, v4.s[1]\n"
+      "ldr d28, [x17, #0x90]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v12.4s, v29.4s, v1.s[2]\n"
+      "ldr x20, [x17, #0xb8]\n"
+      "fmla v16.4s, v29.4s, v2.s[2]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v26.4s, v6.4s, v4.s[1]\n"
-      "ldr d6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v27.4s, v7.4s, v4.s[1]\n"
-      "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "fmla v24.4s, v6.4s, v4.s[2]\n"
-      "ldr d6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "fmla v25.4s, v7.4s, v4.s[2]\n"
-      "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "fmla v26.4s, v6.4s, v4.s[2]\n"
-      "ldr d6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "fmla v27.4s, v7.4s, v4.s[2]\n"
-      "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "fmla v24.4s, v6.4s, v4.s[3]\n"
-      "ldr d6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "mov v6.d[1], x12\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "fmla v25.4s, v7.4s, v4.s[3]\n"
-      "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "fmla v20.4s, v29.4s, v3.s[2]\n"
+      "fmla v24.4s, v29.4s, v4.s[2]\n"
+      "ldr d29, [x17, #0xa0]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.4s, v28.4s, v1.s[2]\n"
+      "ldr x21, [x17, #0xc8]\n"
+      "fmla v17.4s, v28.4s, v2.s[2]\n"
+      "fmla v21.4s, v28.4s, v3.s[2]\n"
+      "fmla v25.4s, v28.4s, v4.s[2]\n"
+      "ldr d28, [x17, #0xb0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v10.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "ldr x20, [x17, #0xd8]\n"
+      "fmla v18.4s, v29.4s, v2.s[2]\n"
+      "fmla v22.4s, v29.4s, v3.s[2]\n"
+      "fmla v26.4s, v29.4s, v4.s[2]\n"
+      "ldr d29, [x17, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[2]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "ldr x21, [x17, #0xe8]\n"
+      "fmla v19.4s, v28.4s, v2.s[2]\n"
+      "fmla v23.4s, v28.4s, v3.s[2]\n"
+      "fmla v27.4s, v28.4s, v4.s[2]\n"
+      "ldr d28, [x17, #0xd0]\n"
+      "mov v28.d[1], x20\n"
+      "fmla v8.4s, v29.4s, v0.s[3]\n"
+      "fmla v12.4s, v29.4s, v1.s[3]\n"
+      "ldr x20, [x17, #0xf8]\n"
+      "fmla v16.4s, v29.4s, v2.s[3]\n"
+      "fmla v20.4s, v29.4s, v3.s[3]\n"
+      "fmla v24.4s, v29.4s, v4.s[3]\n"
+      "ldr d29, [x17, #0xe0]\n"
+      "fmla v9.4s, v28.4s, v0.s[3]\n"
+      "mov v29.d[1], x21\n"
+      "fmla v13.4s, v28.4s, v1.s[3]\n"
+      "fmla v17.4s, v28.4s, v2.s[3]\n"
+      "fmla v21.4s, v28.4s, v3.s[3]\n"
+      "fmla v25.4s, v28.4s, v4.s[3]\n"
+      "ldr d28, [x17, #0xf0]\n"
+      "mov v28.d[1], x20\n"
       "add x17, x17, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "ldr x12, [x17, #0x8]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0x18]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "ldr x21, [x17, #0x8]\n"
+      "fmla v14.4s, v29.4s, v1.s[3]\n"
+      "ldr x20, [x17, #0x18]\n"
+      "fmla v18.4s, v29.4s, v2.s[3]\n"
+      "fmla v22.4s, v29.4s, v3.s[3]\n"
+      "fmla v26.4s, v29.4s, v4.s[3]\n"
       "ldr d6, [x17, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
-      "ldr d3, [x25, #0x0]\n"
-      "fmla v27.4s, v7.4s, v4.s[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "fmla v15.4s, v28.4s, v1.s[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "fmla v19.4s, v28.4s, v2.s[3]\n"
+      "ldr d2, [x11, #0x0]\n"
+      "fmla v23.4s, v28.4s, v3.s[3]\n"
+      "ldr d3, [x10, #0x0]\n"
+      "fmla v27.4s, v28.4s, v4.s[3]\n"
+      "ldr d4, [x9, #0x0]\n"
       "ldr d7, [x17, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x26\n"
+      "mov v1.d[1], x25\n"
+      "mov v2.d[1], x24\n"
+      "mov v3.d[1], x23\n"
       "mov v4.d[1], x22\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "bge 150b\n"
       "151:"  // Height 5: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
+      "ldr q29, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "sub x14, x14, #0x4\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x17, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x17, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "fmla v24.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x17, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x17, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "fmla v26.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x17, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "fmla v27.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x17, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "fmla v24.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x17, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "fmla v25.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x17, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "fmla v26.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x17, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "fmla v27.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x17, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "fmla v24.4s, v6.4s, v4.s[3]\n"
-      "ldr q6, [x17, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "fmla v25.4s, v7.4s, v4.s[3]\n"
-      "ldr q7, [x17, #0xf0]\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "ldr q28, [x17, #0x30]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v18.4s, v29.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "fmla v22.4s, v29.4s, v3.s[0]\n"
+      "fmla v26.4s, v29.4s, v4.s[0]\n"
+      "ldr q29, [x17, #0x40]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v2.s[0]\n"
+      "fmla v23.4s, v28.4s, v3.s[0]\n"
+      "fmla v27.4s, v28.4s, v4.s[0]\n"
+      "ldr q28, [x17, #0x50]\n"
+      "fmla v8.4s, v29.4s, v0.s[1]\n"
+      "fmla v12.4s, v29.4s, v1.s[1]\n"
+      "fmla v16.4s, v29.4s, v2.s[1]\n"
+      "fmla v20.4s, v29.4s, v3.s[1]\n"
+      "fmla v24.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x17, #0x60]\n"
+      "fmla v9.4s, v28.4s, v0.s[1]\n"
+      "fmla v13.4s, v28.4s, v1.s[1]\n"
+      "fmla v17.4s, v28.4s, v2.s[1]\n"
+      "fmla v21.4s, v28.4s, v3.s[1]\n"
+      "fmla v25.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x17, #0x70]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v14.4s, v29.4s, v1.s[1]\n"
+      "fmla v18.4s, v29.4s, v2.s[1]\n"
+      "fmla v22.4s, v29.4s, v3.s[1]\n"
+      "fmla v26.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x17, #0x80]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v15.4s, v28.4s, v1.s[1]\n"
+      "fmla v19.4s, v28.4s, v2.s[1]\n"
+      "fmla v23.4s, v28.4s, v3.s[1]\n"
+      "fmla v27.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x17, #0x90]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v12.4s, v29.4s, v1.s[2]\n"
+      "fmla v16.4s, v29.4s, v2.s[2]\n"
+      "fmla v20.4s, v29.4s, v3.s[2]\n"
+      "fmla v24.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x17, #0xa0]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v13.4s, v28.4s, v1.s[2]\n"
+      "fmla v17.4s, v28.4s, v2.s[2]\n"
+      "fmla v21.4s, v28.4s, v3.s[2]\n"
+      "fmla v25.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x17, #0xb0]\n"
+      "fmla v10.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v18.4s, v29.4s, v2.s[2]\n"
+      "fmla v22.4s, v29.4s, v3.s[2]\n"
+      "fmla v26.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x17, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[2]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v19.4s, v28.4s, v2.s[2]\n"
+      "fmla v23.4s, v28.4s, v3.s[2]\n"
+      "fmla v27.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x17, #0xd0]\n"
+      "fmla v8.4s, v29.4s, v0.s[3]\n"
+      "fmla v12.4s, v29.4s, v1.s[3]\n"
+      "fmla v16.4s, v29.4s, v2.s[3]\n"
+      "fmla v20.4s, v29.4s, v3.s[3]\n"
+      "fmla v24.4s, v29.4s, v4.s[3]\n"
+      "ldr q29, [x17, #0xe0]\n"
+      "fmla v9.4s, v28.4s, v0.s[3]\n"
+      "fmla v13.4s, v28.4s, v1.s[3]\n"
+      "fmla v17.4s, v28.4s, v2.s[3]\n"
+      "fmla v21.4s, v28.4s, v3.s[3]\n"
+      "fmla v25.4s, v28.4s, v4.s[3]\n"
+      "ldr q28, [x17, #0xf0]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
       "add x17, x17, #0x100\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v26.4s, v6.4s, v4.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
-      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v14.4s, v29.4s, v1.s[3]\n"
+      "fmla v18.4s, v29.4s, v2.s[3]\n"
+      "fmla v22.4s, v29.4s, v3.s[3]\n"
+      "fmla v26.4s, v29.4s, v4.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "fmla v15.4s, v28.4s, v1.s[3]\n"
+      "fmla v19.4s, v28.4s, v2.s[3]\n"
+      "fmla v23.4s, v28.4s, v3.s[3]\n"
+      "fmla v27.4s, v28.4s, v4.s[3]\n"
       "152:"  // Height 5: Multiply loop: Main loop skip
       "cbz x14, 154f\n"
       "153:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s2, [x13], #0x4\n"
       "sub x14, x14, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s0, [x11], #0x4\n"
+      "ldr s31, [x10], #0x4\n"
+      "ldr s30, [x9], #0x4\n"
+      "ldr q29, [x17, #0x0]\n"
+      "fmla v8.4s, v29.4s, v2.s[0]\n"
+      "ldr q28, [x17, #0x10]\n"
+      "fmla v12.4s, v29.4s, v1.s[0]\n"
+      "fmla v16.4s, v29.4s, v0.s[0]\n"
+      "fmla v20.4s, v29.4s, v31.s[0]\n"
+      "fmla v24.4s, v29.4s, v30.s[0]\n"
+      "ldr q29, [x17, #0x20]\n"
+      "fmla v9.4s, v28.4s, v2.s[0]\n"
+      "fmla v13.4s, v28.4s, v1.s[0]\n"
+      "fmla v17.4s, v28.4s, v0.s[0]\n"
+      "fmla v21.4s, v28.4s, v31.s[0]\n"
+      "fmla v25.4s, v28.4s, v30.s[0]\n"
+      "ldr q28, [x17, #0x30]\n"
+      "fmla v10.4s, v29.4s, v2.s[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "fmla v18.4s, v29.4s, v0.s[0]\n"
+      "fmla v22.4s, v29.4s, v31.s[0]\n"
+      "fmla v26.4s, v29.4s, v30.s[0]\n"
+      "fmla v11.4s, v28.4s, v2.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v0.s[0]\n"
+      "fmla v23.4s, v28.4s, v31.s[0]\n"
+      "fmla v27.4s, v28.4s, v30.s[0]\n"
       "cbnz x14, 153b\n"
       "154:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2542,49 +2541,49 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 155f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v0.4s\n"
-      "fmin v9.4s, v9.4s, v0.4s\n"
-      "fmin v10.4s, v10.4s, v0.4s\n"
-      "fmin v11.4s, v11.4s, v0.4s\n"
-      "fmin v12.4s, v12.4s, v0.4s\n"
-      "fmin v13.4s, v13.4s, v0.4s\n"
-      "fmin v14.4s, v14.4s, v0.4s\n"
-      "fmin v15.4s, v15.4s, v0.4s\n"
-      "fmin v16.4s, v16.4s, v0.4s\n"
-      "fmin v17.4s, v17.4s, v0.4s\n"
-      "fmin v18.4s, v18.4s, v0.4s\n"
-      "fmin v19.4s, v19.4s, v0.4s\n"
-      "fmin v20.4s, v20.4s, v0.4s\n"
-      "fmin v21.4s, v21.4s, v0.4s\n"
-      "fmin v22.4s, v22.4s, v0.4s\n"
-      "fmin v23.4s, v23.4s, v0.4s\n"
-      "fmin v24.4s, v24.4s, v0.4s\n"
-      "fmin v25.4s, v25.4s, v0.4s\n"
-      "fmin v26.4s, v26.4s, v0.4s\n"
-      "fmin v27.4s, v27.4s, v0.4s\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v28.4s\n"
+      "fmin v9.4s, v9.4s, v28.4s\n"
+      "fmin v10.4s, v10.4s, v28.4s\n"
+      "fmin v11.4s, v11.4s, v28.4s\n"
+      "fmin v12.4s, v12.4s, v28.4s\n"
+      "fmin v13.4s, v13.4s, v28.4s\n"
+      "fmin v14.4s, v14.4s, v28.4s\n"
+      "fmin v15.4s, v15.4s, v28.4s\n"
+      "fmin v16.4s, v16.4s, v28.4s\n"
+      "fmin v17.4s, v17.4s, v28.4s\n"
+      "fmin v18.4s, v18.4s, v28.4s\n"
+      "fmin v19.4s, v19.4s, v28.4s\n"
+      "fmin v20.4s, v20.4s, v28.4s\n"
+      "fmin v21.4s, v21.4s, v28.4s\n"
+      "fmin v22.4s, v22.4s, v28.4s\n"
+      "fmin v23.4s, v23.4s, v28.4s\n"
+      "fmin v24.4s, v24.4s, v28.4s\n"
+      "fmin v25.4s, v25.4s, v28.4s\n"
+      "fmin v26.4s, v26.4s, v28.4s\n"
+      "fmin v27.4s, v27.4s, v28.4s\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
-      "fmax v24.4s, v24.4s, v0.4s\n"
-      "fmax v25.4s, v25.4s, v0.4s\n"
-      "fmax v26.4s, v26.4s, v0.4s\n"
-      "fmax v27.4s, v27.4s, v0.4s\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "fmax v8.4s, v8.4s, v28.4s\n"
+      "fmax v9.4s, v9.4s, v28.4s\n"
+      "fmax v10.4s, v10.4s, v28.4s\n"
+      "fmax v11.4s, v11.4s, v28.4s\n"
+      "fmax v12.4s, v12.4s, v28.4s\n"
+      "fmax v13.4s, v13.4s, v28.4s\n"
+      "fmax v14.4s, v14.4s, v28.4s\n"
+      "fmax v15.4s, v15.4s, v28.4s\n"
+      "fmax v16.4s, v16.4s, v28.4s\n"
+      "fmax v17.4s, v17.4s, v28.4s\n"
+      "fmax v18.4s, v18.4s, v28.4s\n"
+      "fmax v19.4s, v19.4s, v28.4s\n"
+      "fmax v20.4s, v20.4s, v28.4s\n"
+      "fmax v21.4s, v21.4s, v28.4s\n"
+      "fmax v22.4s, v22.4s, v28.4s\n"
+      "fmax v23.4s, v23.4s, v28.4s\n"
+      "fmax v24.4s, v24.4s, v28.4s\n"
+      "fmax v25.4s, v25.4s, v28.4s\n"
+      "fmax v26.4s, v26.4s, v28.4s\n"
+      "fmax v27.4s, v27.4s, v28.4s\n"
       "155:"  // Height 5: No activation
       "cmp x8, #0x10\n"
       "bge 164f\n"
@@ -2961,98 +2960,98 @@
       "180:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 181f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
+      "ldr x28, [x20, #0x28]\n"
       "cbnz x15, 182f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20, LSL #2\n"
+      "add x12, x12, x20, LSL #2\n"
+      "add x11, x11, x20, LSL #2\n"
+      "add x10, x10, x20, LSL #2\n"
       "add x9, x9, x20, LSL #2\n"
-      "add x27, x27, x20, LSL #2\n"
-      "add x25, x25, x20, LSL #2\n"
-      "add x23, x23, x20, LSL #2\n"
-      "add x21, x21, x20, LSL #2\n"
+      "add x28, x28, x20, LSL #2\n"
       "b 182f\n"
       "181:"  // Height 6: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20, LSL #2\n"
-      "add x27, x9, x20, LSL #2\n"
-      "add x25, x27, x20, LSL #2\n"
-      "add x23, x25, x20, LSL #2\n"
-      "add x21, x23, x20, LSL #2\n"
+      "add x12, x13, x21, LSL #2\n"
+      "add x11, x12, x21, LSL #2\n"
+      "add x10, x11, x21, LSL #2\n"
+      "add x9, x10, x21, LSL #2\n"
+      "add x28, x9, x21, LSL #2\n"
       "182:"  // Height 6: input setup done
       "cmp x14, #0x4\n"
       "blt 185f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x8\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
-      "ldr q5, [x21, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
+      "ldr q5, [x28, #0x0]\n"
       "ldr q6, [x17, #0x0]\n"
       "ldr q7, [x17, #0x10]\n"
       "blt 184f\n"
       "183:"  // Height 6: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr x12, [x17, #0x28]\n"
+      "ldr x21, [x17, #0x28]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x38]\n"
+      "ldr x20, [x17, #0x38]\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v28.4s, v6.4s, v5.s[0]\n"
       "ldr d6, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x48]\n"
+      "ldr x21, [x17, #0x48]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       "fmla v29.4s, v7.4s, v5.s[0]\n"
       "ldr d7, [x17, #0x30]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
       "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr x11, [x17, #0x58]\n"
+      "ldr x20, [x17, #0x58]\n"
       "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr x10, [x13, #0x8]\n"
+      "ldr x27, [x13, #0x8]\n"
       "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x26, [x12, #0x8]\n"
       "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr x25, [x11, #0x8]\n"
       "fmla v30.4s, v6.4s, v5.s[0]\n"
       "ldr d6, [x17, #0x40]\n"
       "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr x12, [x17, #0x68]\n"
+      "ldr x21, [x17, #0x68]\n"
       "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr x24, [x25, #0x8]\n"
+      "ldr x24, [x10, #0x8]\n"
       "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr x22, [x23, #0x8]\n"
+      "ldr x23, [x9, #0x8]\n"
       "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr x20, [x21, #0x8]\n"
+      "ldr x22, [x28, #0x8]\n"
       "fmla v31.4s, v7.4s, v5.s[0]\n"
       "ldr d7, [x17, #0x50]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.4s, v6.4s, v0.s[1]\n"
       "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x78]\n"
+      "ldr x20, [x17, #0x78]\n"
       "fmla v16.4s, v6.4s, v2.s[1]\n"
       "sub x14, x14, #0x4\n"
       "fmla v20.4s, v6.4s, v3.s[1]\n"
@@ -3062,96 +3061,96 @@
       "fmla v28.4s, v6.4s, v5.s[1]\n"
       "ldr d6, [x17, #0x60]\n"
       "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0x88]\n"
+      "ldr x21, [x17, #0x88]\n"
       "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v29.4s, v7.4s, v5.s[1]\n"
       "ldr d7, [x17, #0x70]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.4s, v6.4s, v0.s[1]\n"
       "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr x11, [x17, #0x98]\n"
+      "ldr x20, [x17, #0x98]\n"
       "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "fmla v26.4s, v6.4s, v4.s[1]\n"
       "fmla v30.4s, v6.4s, v5.s[1]\n"
       "ldr d6, [x17, #0x80]\n"
       "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr x12, [x17, #0xa8]\n"
+      "ldr x21, [x17, #0xa8]\n"
       "fmla v19.4s, v7.4s, v2.s[1]\n"
       "fmla v23.4s, v7.4s, v3.s[1]\n"
       "fmla v27.4s, v7.4s, v4.s[1]\n"
       "fmla v31.4s, v7.4s, v5.s[1]\n"
       "ldr d7, [x17, #0x90]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.4s, v6.4s, v0.s[2]\n"
       "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xb8]\n"
+      "ldr x20, [x17, #0xb8]\n"
       "fmla v16.4s, v6.4s, v2.s[2]\n"
       "fmla v20.4s, v6.4s, v3.s[2]\n"
       "fmla v24.4s, v6.4s, v4.s[2]\n"
       "fmla v28.4s, v6.4s, v5.s[2]\n"
       "ldr d6, [x17, #0xa0]\n"
       "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xc8]\n"
+      "ldr x21, [x17, #0xc8]\n"
       "fmla v17.4s, v7.4s, v2.s[2]\n"
       "fmla v21.4s, v7.4s, v3.s[2]\n"
       "fmla v25.4s, v7.4s, v4.s[2]\n"
       "fmla v29.4s, v7.4s, v5.s[2]\n"
       "ldr d7, [x17, #0xb0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v10.4s, v6.4s, v0.s[2]\n"
       "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr x11, [x17, #0xd8]\n"
+      "ldr x20, [x17, #0xd8]\n"
       "fmla v18.4s, v6.4s, v2.s[2]\n"
       "fmla v22.4s, v6.4s, v3.s[2]\n"
       "fmla v26.4s, v6.4s, v4.s[2]\n"
       "fmla v30.4s, v6.4s, v5.s[2]\n"
       "ldr d6, [x17, #0xc0]\n"
       "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr x12, [x17, #0xe8]\n"
+      "ldr x21, [x17, #0xe8]\n"
       "fmla v19.4s, v7.4s, v2.s[2]\n"
       "fmla v23.4s, v7.4s, v3.s[2]\n"
       "fmla v27.4s, v7.4s, v4.s[2]\n"
       "fmla v31.4s, v7.4s, v5.s[2]\n"
       "ldr d7, [x17, #0xd0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "fmla v8.4s, v6.4s, v0.s[3]\n"
       "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0xf8]\n"
+      "ldr x20, [x17, #0xf8]\n"
       "fmla v16.4s, v6.4s, v2.s[3]\n"
       "fmla v20.4s, v6.4s, v3.s[3]\n"
       "fmla v24.4s, v6.4s, v4.s[3]\n"
       "fmla v28.4s, v6.4s, v5.s[3]\n"
       "ldr d6, [x17, #0xe0]\n"
       "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       "fmla v13.4s, v7.4s, v1.s[3]\n"
       "fmla v17.4s, v7.4s, v2.s[3]\n"
       "fmla v21.4s, v7.4s, v3.s[3]\n"
       "fmla v25.4s, v7.4s, v4.s[3]\n"
       "fmla v29.4s, v7.4s, v5.s[3]\n"
       "ldr d7, [x17, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "add x17, x17, #0x100\n"
       "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "ldr x12, [x17, #0x8]\n"
+      "ldr x21, [x17, #0x8]\n"
       "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "ldr x11, [x17, #0x18]\n"
+      "ldr x20, [x17, #0x18]\n"
       "fmla v18.4s, v6.4s, v2.s[3]\n"
       "fmla v22.4s, v6.4s, v3.s[3]\n"
       "fmla v26.4s, v6.4s, v4.s[3]\n"
@@ -3160,56 +3159,56 @@
       "fmla v11.4s, v7.4s, v0.s[3]\n"
       "ldr d0, [x13, #0x0]\n"
       "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x12, #0x0]\n"
       "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x11, #0x0]\n"
       "fmla v23.4s, v7.4s, v3.s[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "ldr d3, [x10, #0x0]\n"
       "fmla v27.4s, v7.4s, v4.s[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "ldr d4, [x9, #0x0]\n"
       "fmla v31.4s, v7.4s, v5.s[3]\n"
-      "ldr d5, [x21, #0x0]\n"
+      "ldr d5, [x28, #0x0]\n"
       "ldr d7, [x17, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x27\n"
+      "mov v1.d[1], x26\n"
+      "mov v2.d[1], x25\n"
       "mov v3.d[1], x24\n"
-      "mov v4.d[1], x22\n"
-      "mov v5.d[1], x20\n"
-      "mov v7.d[1], x11\n"
+      "mov v4.d[1], x23\n"
+      "mov v5.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 183b\n"
       "184:"  // Height 6: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "add x13, x13, #0x10\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       "fmla v28.4s, v6.4s, v5.s[0]\n"
       "ldr q6, [x17, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "sub x14, x14, #0x4\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v29.4s, v7.4s, v5.s[0]\n"
       "ldr q7, [x17, #0x30]\n"
       "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "fmla v22.4s, v6.4s, v3.s[0]\n"
       "fmla v26.4s, v6.4s, v4.s[0]\n"
       "fmla v30.4s, v6.4s, v5.s[0]\n"
@@ -3307,42 +3306,42 @@
       "185:"  // Height 6: Multiply loop: Main loop skip
       "cbz x14, 187f\n"
       "186:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s7, [x13], #0x4\n"
       "sub x14, x14, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x17, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x17, #0x10]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "fmla v28.4s, v6.4s, v5.s[0]\n"
-      "ldr q6, [x17, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "fmla v29.4s, v7.4s, v5.s[0]\n"
-      "ldr q7, [x17, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr s6, [x12], #0x4\n"
+      "ldr s5, [x11], #0x4\n"
+      "ldr s4, [x10], #0x4\n"
+      "ldr s3, [x9], #0x4\n"
+      "ldr s2, [x28], #0x4\n"
+      "ldr q1, [x17, #0x0]\n"
+      "fmla v8.4s, v1.4s, v7.s[0]\n"
+      "ldr q0, [x17, #0x10]\n"
+      "fmla v12.4s, v1.4s, v6.s[0]\n"
+      "fmla v16.4s, v1.4s, v5.s[0]\n"
+      "fmla v20.4s, v1.4s, v4.s[0]\n"
+      "fmla v24.4s, v1.4s, v3.s[0]\n"
+      "fmla v28.4s, v1.4s, v2.s[0]\n"
+      "ldr q1, [x17, #0x20]\n"
+      "fmla v9.4s, v0.4s, v7.s[0]\n"
+      "fmla v13.4s, v0.4s, v6.s[0]\n"
+      "fmla v17.4s, v0.4s, v5.s[0]\n"
+      "fmla v21.4s, v0.4s, v4.s[0]\n"
+      "fmla v25.4s, v0.4s, v3.s[0]\n"
+      "fmla v29.4s, v0.4s, v2.s[0]\n"
+      "ldr q0, [x17, #0x30]\n"
+      "fmla v10.4s, v1.4s, v7.s[0]\n"
       "add x17, x17, #0x40\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "fmla v30.4s, v6.4s, v5.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "fmla v31.4s, v7.4s, v5.s[0]\n"
+      "fmla v14.4s, v1.4s, v6.s[0]\n"
+      "fmla v18.4s, v1.4s, v5.s[0]\n"
+      "fmla v22.4s, v1.4s, v4.s[0]\n"
+      "fmla v26.4s, v1.4s, v3.s[0]\n"
+      "fmla v30.4s, v1.4s, v2.s[0]\n"
+      "fmla v11.4s, v0.4s, v7.s[0]\n"
+      "fmla v15.4s, v0.4s, v6.s[0]\n"
+      "fmla v19.4s, v0.4s, v5.s[0]\n"
+      "fmla v23.4s, v0.4s, v4.s[0]\n"
+      "fmla v27.4s, v0.4s, v3.s[0]\n"
+      "fmla v31.4s, v0.4s, v2.s[0]\n"
       "cbnz x14, 186b\n"
       "187:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3584,7 +3583,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "200:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
index c5e4388..bb84a50 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -92,7 +92,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 166f\n"
@@ -189,11 +188,11 @@
       "15:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -210,37 +209,37 @@
       "blt 19f\n"
       "18:"  // Height 1: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "sub x27, x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
       "cmp x27, #0x8\n"
       "add x10, x10, #0x100\n"
@@ -250,52 +249,52 @@
       "bge 18b\n"
       "19:"  // Height 1: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x4\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "20:"  // Height 1: Multiply loop: Main loop skip
       "cbz x27, 22f\n"
       "21:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr q16, [x10, #0x0]\n"
+      "fmla v8.4s, v16.4s, v18.s[0]\n"
       "sub x27, x27, #0x1\n"
-      "ldr q7, [x10, #0x10]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "ldr q16, [x10, #0x20]\n"
+      "fmla v9.4s, v17.4s, v18.s[0]\n"
+      "fmla v10.4s, v16.4s, v18.s[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v11.4s, v16.4s, v18.s[0]\n"
       "add x10, x10, #0x40\n"
       "cbnz x27, 21b\n"
       "22:"  // Height 1: Multiply loop: No odd multiplies
@@ -306,17 +305,17 @@
       "prfm pstl1keep, [x9, #0x0]\n"
       "tbz %x[flags], #1, 23f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
       "23:"  // Height 1: No activation
       "cmp x11, #0x10\n"
       "bge 32f\n"
@@ -494,12 +493,12 @@
       "48:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 49f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 50f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -507,7 +506,7 @@
       "b 50f\n"
       "49:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "50:"  // Height 2: input setup done
       "cmp x27, #0x4\n"
       "blt 53f\n"
@@ -520,134 +519,134 @@
       "51:"  // Height 2: Multiply loop: Main loop head
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "sub x27, x27, #0x4\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x26, x26, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "fmla v14.4s, v17.4s, v1.s[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "add x25, x25, #0x10\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "fmla v15.4s, v16.4s, v1.s[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "cmp x27, #0x8\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "fmla v12.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "fmla v13.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x10, #0x70]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "fmla v14.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "fmla v15.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "fmla v12.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "fmla v13.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "fmla v14.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "fmla v15.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v1.s[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v1.s[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v14.4s, v17.4s, v1.s[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v16.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 51b\n"
       "52:"  // Height 2: Multiply loop: Single iteration only
       "fmla v8.4s, v6.4s, v0.s[0]\n"
       "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "add x26, x26, #0x10\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x25, x25, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v10.4s, v17.4s, v0.s[0]\n"
+      "fmla v14.4s, v17.4s, v1.s[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "sub x27, x27, #0x4\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      "fmla v11.4s, v16.4s, v0.s[0]\n"
+      "fmla v15.4s, v16.4s, v1.s[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      "fmla v8.4s, v17.4s, v0.s[1]\n"
+      "fmla v12.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v9.4s, v16.4s, v0.s[1]\n"
+      "fmla v13.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      "fmla v10.4s, v17.4s, v0.s[1]\n"
+      "fmla v14.4s, v17.4s, v1.s[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      "fmla v11.4s, v16.4s, v0.s[1]\n"
+      "fmla v15.4s, v16.4s, v1.s[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      "fmla v8.4s, v17.4s, v0.s[2]\n"
+      "fmla v12.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      "fmla v9.4s, v16.4s, v0.s[2]\n"
+      "fmla v13.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      "fmla v10.4s, v17.4s, v0.s[2]\n"
+      "fmla v14.4s, v17.4s, v1.s[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      "fmla v11.4s, v16.4s, v0.s[2]\n"
+      "fmla v15.4s, v16.4s, v1.s[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      "fmla v8.4s, v17.4s, v0.s[3]\n"
+      "fmla v12.4s, v17.4s, v1.s[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      "fmla v9.4s, v16.4s, v0.s[3]\n"
+      "fmla v13.4s, v16.4s, v1.s[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v10.4s, v17.4s, v0.s[3]\n"
+      "fmla v14.4s, v17.4s, v1.s[3]\n"
+      "fmla v11.4s, v16.4s, v0.s[3]\n"
+      "fmla v15.4s, v16.4s, v1.s[3]\n"
       "53:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 55f\n"
       "54:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      "fmla v8.4s, v17.4s, v19.s[0]\n"
+      "fmla v12.4s, v17.4s, v18.s[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "fmla v9.4s, v16.4s, v19.s[0]\n"
+      "fmla v13.4s, v16.4s, v18.s[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      "fmla v10.4s, v17.4s, v19.s[0]\n"
+      "fmla v14.4s, v17.4s, v18.s[0]\n"
       "add x10, x10, #0x40\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v11.4s, v16.4s, v19.s[0]\n"
+      "fmla v15.4s, v16.4s, v18.s[0]\n"
       "cbnz x27, 54b\n"
       "55:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -660,25 +659,25 @@
       "prfm pstl1keep, [x25, #0x0]\n"
       "tbz %x[flags], #1, 56f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v17.4s\n"
+      "fmin v9.4s, v9.4s, v17.4s\n"
+      "fmin v10.4s, v10.4s, v17.4s\n"
+      "fmin v11.4s, v11.4s, v17.4s\n"
+      "fmin v12.4s, v12.4s, v17.4s\n"
+      "fmin v13.4s, v13.4s, v17.4s\n"
+      "fmin v14.4s, v14.4s, v17.4s\n"
+      "fmin v15.4s, v15.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v16.4s\n"
+      "fmax v9.4s, v9.4s, v16.4s\n"
+      "fmax v10.4s, v10.4s, v16.4s\n"
+      "fmax v11.4s, v11.4s, v16.4s\n"
+      "fmax v12.4s, v12.4s, v16.4s\n"
+      "fmax v13.4s, v13.4s, v16.4s\n"
+      "fmax v14.4s, v14.4s, v16.4s\n"
+      "fmax v15.4s, v15.4s, v16.4s\n"
       "56:"  // Height 2: No activation
       "cmp x11, #0x10\n"
       "bge 65f\n"
@@ -905,13 +904,13 @@
       "81:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 82f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 83f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -920,8 +919,8 @@
       "b 83f\n"
       "82:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "83:"  // Height 3: input setup done
       "cmp x27, #0x4\n"
       "blt 86f\n"
@@ -938,75 +937,75 @@
       "sub x27, x27, #0x4\n"
       "add x26, x26, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x25, x25, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x24, x24, #0x10\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v21.4s, v0.s[0]\n"
+      "fmla v14.4s, v21.4s, v1.s[0]\n"
       "cmp x27, #0x8\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v18.4s, v21.4s, v2.s[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      "fmla v11.4s, v20.4s, v0.s[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      "fmla v15.4s, v20.4s, v1.s[0]\n"
+      "fmla v19.4s, v20.4s, v2.s[0]\n"
+      "ldr q20, [x10, #0x50]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v12.4s, v21.4s, v1.s[1]\n"
+      "fmla v16.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v13.4s, v20.4s, v1.s[1]\n"
+      "fmla v17.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      "fmla v10.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "fmla v18.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      "fmla v11.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "fmla v19.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      "fmla v8.4s, v21.4s, v0.s[2]\n"
+      "fmla v12.4s, v21.4s, v1.s[2]\n"
+      "fmla v16.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[2]\n"
+      "fmla v13.4s, v20.4s, v1.s[2]\n"
+      "fmla v17.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v14.4s, v21.4s, v1.s[2]\n"
+      "fmla v18.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "fmla v15.4s, v20.4s, v1.s[2]\n"
+      "fmla v19.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v12.4s, v21.4s, v1.s[3]\n"
+      "fmla v16.4s, v21.4s, v2.s[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "fmla v13.4s, v20.4s, v1.s[3]\n"
+      "fmla v17.4s, v20.4s, v2.s[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v10.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "fmla v18.4s, v21.4s, v2.s[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v20.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v19.4s, v20.4s, v2.s[3]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 84b\n"
@@ -1016,95 +1015,95 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x24, x24, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "sub x27, x27, #0x4\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v21.4s, v0.s[0]\n"
+      "fmla v14.4s, v21.4s, v1.s[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v18.4s, v21.4s, v2.s[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      "fmla v11.4s, v20.4s, v0.s[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v15.4s, v20.4s, v1.s[0]\n"
+      "fmla v19.4s, v20.4s, v2.s[0]\n"
+      "ldr q20, [x10, #0x50]\n"
+      "fmla v8.4s, v21.4s, v0.s[1]\n"
+      "fmla v12.4s, v21.4s, v1.s[1]\n"
+      "fmla v16.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      "fmla v9.4s, v20.4s, v0.s[1]\n"
+      "fmla v13.4s, v20.4s, v1.s[1]\n"
+      "fmla v17.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      "fmla v10.4s, v21.4s, v0.s[1]\n"
+      "fmla v14.4s, v21.4s, v1.s[1]\n"
+      "fmla v18.4s, v21.4s, v2.s[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      "fmla v11.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v20.4s, v1.s[1]\n"
+      "fmla v19.4s, v20.4s, v2.s[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      "fmla v8.4s, v21.4s, v0.s[2]\n"
+      "fmla v12.4s, v21.4s, v1.s[2]\n"
+      "fmla v16.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      "fmla v9.4s, v20.4s, v0.s[2]\n"
+      "fmla v13.4s, v20.4s, v1.s[2]\n"
+      "fmla v17.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      "fmla v10.4s, v21.4s, v0.s[2]\n"
+      "fmla v14.4s, v21.4s, v1.s[2]\n"
+      "fmla v18.4s, v21.4s, v2.s[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      "fmla v11.4s, v20.4s, v0.s[2]\n"
+      "fmla v15.4s, v20.4s, v1.s[2]\n"
+      "fmla v19.4s, v20.4s, v2.s[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      "fmla v8.4s, v21.4s, v0.s[3]\n"
+      "fmla v12.4s, v21.4s, v1.s[3]\n"
+      "fmla v16.4s, v21.4s, v2.s[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      "fmla v9.4s, v20.4s, v0.s[3]\n"
+      "fmla v13.4s, v20.4s, v1.s[3]\n"
+      "fmla v17.4s, v20.4s, v2.s[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v10.4s, v21.4s, v0.s[3]\n"
+      "fmla v14.4s, v21.4s, v1.s[3]\n"
+      "fmla v18.4s, v21.4s, v2.s[3]\n"
+      "fmla v11.4s, v20.4s, v0.s[3]\n"
+      "fmla v15.4s, v20.4s, v1.s[3]\n"
+      "fmla v19.4s, v20.4s, v2.s[3]\n"
       "86:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 88f\n"
       "87:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr q21, [x10, #0x0]\n"
+      "fmla v8.4s, v21.4s, v24.s[0]\n"
+      "fmla v12.4s, v21.4s, v23.s[0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      "fmla v16.4s, v21.4s, v22.s[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "fmla v9.4s, v20.4s, v24.s[0]\n"
+      "fmla v13.4s, v20.4s, v23.s[0]\n"
+      "fmla v17.4s, v20.4s, v22.s[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v10.4s, v21.4s, v24.s[0]\n"
+      "fmla v14.4s, v21.4s, v23.s[0]\n"
+      "fmla v18.4s, v21.4s, v22.s[0]\n"
+      "fmla v11.4s, v20.4s, v24.s[0]\n"
+      "fmla v15.4s, v20.4s, v23.s[0]\n"
+      "fmla v19.4s, v20.4s, v22.s[0]\n"
       "cbnz x27, 87b\n"
       "88:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1119,33 +1118,33 @@
       "prfm pstl1keep, [x24, #0x0]\n"
       "tbz %x[flags], #1, 89f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v21.4s\n"
+      "fmin v9.4s, v9.4s, v21.4s\n"
+      "fmin v10.4s, v10.4s, v21.4s\n"
+      "fmin v11.4s, v11.4s, v21.4s\n"
+      "fmin v12.4s, v12.4s, v21.4s\n"
+      "fmin v13.4s, v13.4s, v21.4s\n"
+      "fmin v14.4s, v14.4s, v21.4s\n"
+      "fmin v15.4s, v15.4s, v21.4s\n"
+      "fmin v16.4s, v16.4s, v21.4s\n"
+      "fmin v17.4s, v17.4s, v21.4s\n"
+      "fmin v18.4s, v18.4s, v21.4s\n"
+      "fmin v19.4s, v19.4s, v21.4s\n"
+      "fmax v8.4s, v8.4s, v20.4s\n"
+      "fmax v9.4s, v9.4s, v20.4s\n"
+      "fmax v10.4s, v10.4s, v20.4s\n"
+      "fmax v11.4s, v11.4s, v20.4s\n"
+      "fmax v12.4s, v12.4s, v20.4s\n"
+      "fmax v13.4s, v13.4s, v20.4s\n"
+      "fmax v14.4s, v14.4s, v20.4s\n"
+      "fmax v15.4s, v15.4s, v20.4s\n"
+      "fmax v16.4s, v16.4s, v20.4s\n"
+      "fmax v17.4s, v17.4s, v20.4s\n"
+      "fmax v18.4s, v18.4s, v20.4s\n"
+      "fmax v19.4s, v19.4s, v20.4s\n"
       "89:"  // Height 3: No activation
       "cmp x11, #0x10\n"
       "bge 98f\n"
@@ -1421,14 +1420,14 @@
       "114:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 115f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 116f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1438,9 +1437,9 @@
       "b 116f\n"
       "115:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "116:"  // Height 4: input setup done
       "cmp x27, #0x4\n"
       "blt 119f\n"
@@ -1459,7 +1458,7 @@
       "add x26, x26, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x25, x25, #0x10\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1467,85 +1466,85 @@
       "add x23, x23, #0x10\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "cmp x27, #0x8\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v25.4s, v0.s[0]\n"
+      "fmla v14.4s, v25.4s, v1.s[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v18.4s, v25.4s, v2.s[0]\n"
+      "fmla v22.4s, v25.4s, v3.s[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v11.4s, v24.4s, v0.s[0]\n"
+      "fmla v15.4s, v24.4s, v1.s[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v19.4s, v24.4s, v2.s[0]\n"
+      "fmla v23.4s, v24.4s, v3.s[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      "fmla v8.4s, v25.4s, v0.s[1]\n"
+      "fmla v12.4s, v25.4s, v1.s[1]\n"
+      "fmla v16.4s, v25.4s, v2.s[1]\n"
+      "fmla v20.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      "fmla v9.4s, v24.4s, v0.s[1]\n"
+      "fmla v13.4s, v24.4s, v1.s[1]\n"
+      "fmla v17.4s, v24.4s, v2.s[1]\n"
+      "fmla v21.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      "fmla v10.4s, v25.4s, v0.s[1]\n"
+      "fmla v14.4s, v25.4s, v1.s[1]\n"
+      "fmla v18.4s, v25.4s, v2.s[1]\n"
+      "fmla v22.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      "fmla v11.4s, v24.4s, v0.s[1]\n"
+      "fmla v15.4s, v24.4s, v1.s[1]\n"
+      "fmla v19.4s, v24.4s, v2.s[1]\n"
+      "fmla v23.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      "fmla v8.4s, v25.4s, v0.s[2]\n"
+      "fmla v12.4s, v25.4s, v1.s[2]\n"
+      "fmla v16.4s, v25.4s, v2.s[2]\n"
+      "fmla v20.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      "fmla v9.4s, v24.4s, v0.s[2]\n"
+      "fmla v13.4s, v24.4s, v1.s[2]\n"
+      "fmla v17.4s, v24.4s, v2.s[2]\n"
+      "fmla v21.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      "fmla v10.4s, v25.4s, v0.s[2]\n"
+      "fmla v14.4s, v25.4s, v1.s[2]\n"
+      "fmla v18.4s, v25.4s, v2.s[2]\n"
+      "fmla v22.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      "fmla v11.4s, v24.4s, v0.s[2]\n"
+      "fmla v15.4s, v24.4s, v1.s[2]\n"
+      "fmla v19.4s, v24.4s, v2.s[2]\n"
+      "fmla v23.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      "fmla v8.4s, v25.4s, v0.s[3]\n"
+      "fmla v12.4s, v25.4s, v1.s[3]\n"
+      "fmla v16.4s, v25.4s, v2.s[3]\n"
+      "fmla v20.4s, v25.4s, v3.s[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      "fmla v9.4s, v24.4s, v0.s[3]\n"
+      "fmla v13.4s, v24.4s, v1.s[3]\n"
+      "fmla v17.4s, v24.4s, v2.s[3]\n"
+      "fmla v21.4s, v24.4s, v3.s[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v10.4s, v25.4s, v0.s[3]\n"
+      "fmla v14.4s, v25.4s, v1.s[3]\n"
+      "fmla v18.4s, v25.4s, v2.s[3]\n"
+      "fmla v22.4s, v25.4s, v3.s[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v24.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v24.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v19.4s, v24.4s, v2.s[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v23.4s, v24.4s, v3.s[3]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 117b\n"
@@ -1556,7 +1555,7 @@
       "add x25, x25, #0x10\n"
       "fmla v16.4s, v6.4s, v2.s[0]\n"
       "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x24, x24, #0x10\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -1564,109 +1563,109 @@
       "sub x27, x27, #0x4\n"
       "fmla v17.4s, v7.4s, v2.s[0]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v25.4s, v0.s[0]\n"
+      "fmla v14.4s, v25.4s, v1.s[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      "fmla v18.4s, v25.4s, v2.s[0]\n"
+      "fmla v22.4s, v25.4s, v3.s[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v11.4s, v24.4s, v0.s[0]\n"
+      "fmla v15.4s, v24.4s, v1.s[0]\n"
+      "fmla v19.4s, v24.4s, v2.s[0]\n"
+      "fmla v23.4s, v24.4s, v3.s[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      "fmla v8.4s, v25.4s, v0.s[1]\n"
+      "fmla v12.4s, v25.4s, v1.s[1]\n"
+      "fmla v16.4s, v25.4s, v2.s[1]\n"
+      "fmla v20.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      "fmla v9.4s, v24.4s, v0.s[1]\n"
+      "fmla v13.4s, v24.4s, v1.s[1]\n"
+      "fmla v17.4s, v24.4s, v2.s[1]\n"
+      "fmla v21.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      "fmla v10.4s, v25.4s, v0.s[1]\n"
+      "fmla v14.4s, v25.4s, v1.s[1]\n"
+      "fmla v18.4s, v25.4s, v2.s[1]\n"
+      "fmla v22.4s, v25.4s, v3.s[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      "fmla v11.4s, v24.4s, v0.s[1]\n"
+      "fmla v15.4s, v24.4s, v1.s[1]\n"
+      "fmla v19.4s, v24.4s, v2.s[1]\n"
+      "fmla v23.4s, v24.4s, v3.s[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      "fmla v8.4s, v25.4s, v0.s[2]\n"
+      "fmla v12.4s, v25.4s, v1.s[2]\n"
+      "fmla v16.4s, v25.4s, v2.s[2]\n"
+      "fmla v20.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      "fmla v9.4s, v24.4s, v0.s[2]\n"
+      "fmla v13.4s, v24.4s, v1.s[2]\n"
+      "fmla v17.4s, v24.4s, v2.s[2]\n"
+      "fmla v21.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      "fmla v10.4s, v25.4s, v0.s[2]\n"
+      "fmla v14.4s, v25.4s, v1.s[2]\n"
+      "fmla v18.4s, v25.4s, v2.s[2]\n"
+      "fmla v22.4s, v25.4s, v3.s[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      "fmla v11.4s, v24.4s, v0.s[2]\n"
+      "fmla v15.4s, v24.4s, v1.s[2]\n"
+      "fmla v19.4s, v24.4s, v2.s[2]\n"
+      "fmla v23.4s, v24.4s, v3.s[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      "fmla v8.4s, v25.4s, v0.s[3]\n"
+      "fmla v12.4s, v25.4s, v1.s[3]\n"
+      "fmla v16.4s, v25.4s, v2.s[3]\n"
+      "fmla v20.4s, v25.4s, v3.s[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      "fmla v9.4s, v24.4s, v0.s[3]\n"
+      "fmla v13.4s, v24.4s, v1.s[3]\n"
+      "fmla v17.4s, v24.4s, v2.s[3]\n"
+      "fmla v21.4s, v24.4s, v3.s[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v10.4s, v25.4s, v0.s[3]\n"
+      "fmla v14.4s, v25.4s, v1.s[3]\n"
+      "fmla v18.4s, v25.4s, v2.s[3]\n"
+      "fmla v22.4s, v25.4s, v3.s[3]\n"
+      "fmla v11.4s, v24.4s, v0.s[3]\n"
+      "fmla v15.4s, v24.4s, v1.s[3]\n"
+      "fmla v19.4s, v24.4s, v2.s[3]\n"
+      "fmla v23.4s, v24.4s, v3.s[3]\n"
       "119:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 121f\n"
       "120:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      "fmla v8.4s, v25.4s, v29.s[0]\n"
+      "fmla v12.4s, v25.4s, v28.s[0]\n"
+      "fmla v16.4s, v25.4s, v27.s[0]\n"
+      "fmla v20.4s, v25.4s, v26.s[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      "fmla v9.4s, v24.4s, v29.s[0]\n"
+      "fmla v13.4s, v24.4s, v28.s[0]\n"
+      "fmla v17.4s, v24.4s, v27.s[0]\n"
+      "fmla v21.4s, v24.4s, v26.s[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v10.4s, v25.4s, v29.s[0]\n"
+      "fmla v14.4s, v25.4s, v28.s[0]\n"
+      "fmla v18.4s, v25.4s, v27.s[0]\n"
+      "fmla v22.4s, v25.4s, v26.s[0]\n"
+      "fmla v11.4s, v24.4s, v29.s[0]\n"
+      "fmla v15.4s, v24.4s, v28.s[0]\n"
+      "fmla v19.4s, v24.4s, v27.s[0]\n"
+      "fmla v23.4s, v24.4s, v26.s[0]\n"
       "cbnz x27, 120b\n"
       "121:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1683,41 +1682,41 @@
       "prfm pstl1keep, [x23, #0x0]\n"
       "tbz %x[flags], #1, 122f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v25.4s\n"
+      "fmin v9.4s, v9.4s, v25.4s\n"
+      "fmin v10.4s, v10.4s, v25.4s\n"
+      "fmin v11.4s, v11.4s, v25.4s\n"
+      "fmin v12.4s, v12.4s, v25.4s\n"
+      "fmin v13.4s, v13.4s, v25.4s\n"
+      "fmin v14.4s, v14.4s, v25.4s\n"
+      "fmin v15.4s, v15.4s, v25.4s\n"
+      "fmin v16.4s, v16.4s, v25.4s\n"
+      "fmin v17.4s, v17.4s, v25.4s\n"
+      "fmin v18.4s, v18.4s, v25.4s\n"
+      "fmin v19.4s, v19.4s, v25.4s\n"
+      "fmin v20.4s, v20.4s, v25.4s\n"
+      "fmin v21.4s, v21.4s, v25.4s\n"
+      "fmin v22.4s, v22.4s, v25.4s\n"
+      "fmin v23.4s, v23.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v24.4s\n"
+      "fmax v9.4s, v9.4s, v24.4s\n"
+      "fmax v10.4s, v10.4s, v24.4s\n"
+      "fmax v11.4s, v11.4s, v24.4s\n"
+      "fmax v12.4s, v12.4s, v24.4s\n"
+      "fmax v13.4s, v13.4s, v24.4s\n"
+      "fmax v14.4s, v14.4s, v24.4s\n"
+      "fmax v15.4s, v15.4s, v24.4s\n"
+      "fmax v16.4s, v16.4s, v24.4s\n"
+      "fmax v17.4s, v17.4s, v24.4s\n"
+      "fmax v18.4s, v18.4s, v24.4s\n"
+      "fmax v19.4s, v19.4s, v24.4s\n"
+      "fmax v20.4s, v20.4s, v24.4s\n"
+      "fmax v21.4s, v21.4s, v24.4s\n"
+      "fmax v22.4s, v22.4s, v24.4s\n"
+      "fmax v23.4s, v23.4s, v24.4s\n"
       "122:"  // Height 4: No activation
       "cmp x11, #0x10\n"
       "bge 131f\n"
@@ -2042,15 +2041,15 @@
       "147:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 148f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 149f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -2061,10 +2060,10 @@
       "b 149f\n"
       "148:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "149:"  // Height 5: input setup done
       "cmp x27, #0x4\n"
       "blt 152f\n"
@@ -2087,7 +2086,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x23, x23, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2096,100 +2095,100 @@
       "cmp x27, #0x8\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v18.4s, v29.4s, v2.s[0]\n"
+      "fmla v22.4s, v29.4s, v3.s[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "fmla v24.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "fmla v26.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "fmla v27.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "fmla v24.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "fmla v25.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "fmla v26.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "fmla v27.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "fmla v24.4s, v6.4s, v4.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "fmla v25.4s, v7.4s, v4.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v26.4s, v29.4s, v4.s[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v2.s[0]\n"
+      "fmla v23.4s, v28.4s, v3.s[0]\n"
+      "fmla v27.4s, v28.4s, v4.s[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      "fmla v8.4s, v29.4s, v0.s[1]\n"
+      "fmla v12.4s, v29.4s, v1.s[1]\n"
+      "fmla v16.4s, v29.4s, v2.s[1]\n"
+      "fmla v20.4s, v29.4s, v3.s[1]\n"
+      "fmla v24.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      "fmla v9.4s, v28.4s, v0.s[1]\n"
+      "fmla v13.4s, v28.4s, v1.s[1]\n"
+      "fmla v17.4s, v28.4s, v2.s[1]\n"
+      "fmla v21.4s, v28.4s, v3.s[1]\n"
+      "fmla v25.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v14.4s, v29.4s, v1.s[1]\n"
+      "fmla v18.4s, v29.4s, v2.s[1]\n"
+      "fmla v22.4s, v29.4s, v3.s[1]\n"
+      "fmla v26.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v15.4s, v28.4s, v1.s[1]\n"
+      "fmla v19.4s, v28.4s, v2.s[1]\n"
+      "fmla v23.4s, v28.4s, v3.s[1]\n"
+      "fmla v27.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v12.4s, v29.4s, v1.s[2]\n"
+      "fmla v16.4s, v29.4s, v2.s[2]\n"
+      "fmla v20.4s, v29.4s, v3.s[2]\n"
+      "fmla v24.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v13.4s, v28.4s, v1.s[2]\n"
+      "fmla v17.4s, v28.4s, v2.s[2]\n"
+      "fmla v21.4s, v28.4s, v3.s[2]\n"
+      "fmla v25.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      "fmla v10.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v18.4s, v29.4s, v2.s[2]\n"
+      "fmla v22.4s, v29.4s, v3.s[2]\n"
+      "fmla v26.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[2]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v19.4s, v28.4s, v2.s[2]\n"
+      "fmla v23.4s, v28.4s, v3.s[2]\n"
+      "fmla v27.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      "fmla v8.4s, v29.4s, v0.s[3]\n"
+      "fmla v12.4s, v29.4s, v1.s[3]\n"
+      "fmla v16.4s, v29.4s, v2.s[3]\n"
+      "fmla v20.4s, v29.4s, v3.s[3]\n"
+      "fmla v24.4s, v29.4s, v4.s[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      "fmla v9.4s, v28.4s, v0.s[3]\n"
+      "fmla v13.4s, v28.4s, v1.s[3]\n"
+      "fmla v17.4s, v28.4s, v2.s[3]\n"
+      "fmla v21.4s, v28.4s, v3.s[3]\n"
+      "fmla v25.4s, v28.4s, v4.s[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "fmla v14.4s, v29.4s, v1.s[3]\n"
+      "fmla v18.4s, v29.4s, v2.s[3]\n"
+      "fmla v22.4s, v29.4s, v3.s[3]\n"
+      "fmla v26.4s, v29.4s, v4.s[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v15.4s, v28.4s, v1.s[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v19.4s, v28.4s, v2.s[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v23.4s, v28.4s, v3.s[3]\n"
       "ldr q3, [x23, #0x0]\n"
-      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v27.4s, v28.4s, v4.s[3]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 150b\n"
@@ -2203,7 +2202,7 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       "fmla v9.4s, v7.4s, v0.s[0]\n"
       "add x22, x22, #0x10\n"
       "fmla v13.4s, v7.4s, v1.s[0]\n"
@@ -2212,128 +2211,128 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       "fmla v21.4s, v7.4s, v3.s[0]\n"
       "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v10.4s, v29.4s, v0.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v18.4s, v29.4s, v2.s[0]\n"
+      "fmla v22.4s, v29.4s, v3.s[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      "fmla v8.4s, v6.4s, v0.s[1]\n"
-      "fmla v12.4s, v6.4s, v1.s[1]\n"
-      "fmla v16.4s, v6.4s, v2.s[1]\n"
-      "fmla v20.4s, v6.4s, v3.s[1]\n"
-      "fmla v24.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      "fmla v9.4s, v7.4s, v0.s[1]\n"
-      "fmla v13.4s, v7.4s, v1.s[1]\n"
-      "fmla v17.4s, v7.4s, v2.s[1]\n"
-      "fmla v21.4s, v7.4s, v3.s[1]\n"
-      "fmla v25.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      "fmla v10.4s, v6.4s, v0.s[1]\n"
-      "fmla v14.4s, v6.4s, v1.s[1]\n"
-      "fmla v18.4s, v6.4s, v2.s[1]\n"
-      "fmla v22.4s, v6.4s, v3.s[1]\n"
-      "fmla v26.4s, v6.4s, v4.s[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      "fmla v11.4s, v7.4s, v0.s[1]\n"
-      "fmla v15.4s, v7.4s, v1.s[1]\n"
-      "fmla v19.4s, v7.4s, v2.s[1]\n"
-      "fmla v23.4s, v7.4s, v3.s[1]\n"
-      "fmla v27.4s, v7.4s, v4.s[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      "fmla v8.4s, v6.4s, v0.s[2]\n"
-      "fmla v12.4s, v6.4s, v1.s[2]\n"
-      "fmla v16.4s, v6.4s, v2.s[2]\n"
-      "fmla v20.4s, v6.4s, v3.s[2]\n"
-      "fmla v24.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      "fmla v9.4s, v7.4s, v0.s[2]\n"
-      "fmla v13.4s, v7.4s, v1.s[2]\n"
-      "fmla v17.4s, v7.4s, v2.s[2]\n"
-      "fmla v21.4s, v7.4s, v3.s[2]\n"
-      "fmla v25.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      "fmla v10.4s, v6.4s, v0.s[2]\n"
-      "fmla v14.4s, v6.4s, v1.s[2]\n"
-      "fmla v18.4s, v6.4s, v2.s[2]\n"
-      "fmla v22.4s, v6.4s, v3.s[2]\n"
-      "fmla v26.4s, v6.4s, v4.s[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      "fmla v11.4s, v7.4s, v0.s[2]\n"
-      "fmla v15.4s, v7.4s, v1.s[2]\n"
-      "fmla v19.4s, v7.4s, v2.s[2]\n"
-      "fmla v23.4s, v7.4s, v3.s[2]\n"
-      "fmla v27.4s, v7.4s, v4.s[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      "fmla v8.4s, v6.4s, v0.s[3]\n"
-      "fmla v12.4s, v6.4s, v1.s[3]\n"
-      "fmla v16.4s, v6.4s, v2.s[3]\n"
-      "fmla v20.4s, v6.4s, v3.s[3]\n"
-      "fmla v24.4s, v6.4s, v4.s[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      "fmla v9.4s, v7.4s, v0.s[3]\n"
-      "fmla v13.4s, v7.4s, v1.s[3]\n"
-      "fmla v17.4s, v7.4s, v2.s[3]\n"
-      "fmla v21.4s, v7.4s, v3.s[3]\n"
-      "fmla v25.4s, v7.4s, v4.s[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "fmla v26.4s, v29.4s, v4.s[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      "fmla v11.4s, v28.4s, v0.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v2.s[0]\n"
+      "fmla v23.4s, v28.4s, v3.s[0]\n"
+      "fmla v27.4s, v28.4s, v4.s[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      "fmla v8.4s, v29.4s, v0.s[1]\n"
+      "fmla v12.4s, v29.4s, v1.s[1]\n"
+      "fmla v16.4s, v29.4s, v2.s[1]\n"
+      "fmla v20.4s, v29.4s, v3.s[1]\n"
+      "fmla v24.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      "fmla v9.4s, v28.4s, v0.s[1]\n"
+      "fmla v13.4s, v28.4s, v1.s[1]\n"
+      "fmla v17.4s, v28.4s, v2.s[1]\n"
+      "fmla v21.4s, v28.4s, v3.s[1]\n"
+      "fmla v25.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      "fmla v10.4s, v29.4s, v0.s[1]\n"
+      "fmla v14.4s, v29.4s, v1.s[1]\n"
+      "fmla v18.4s, v29.4s, v2.s[1]\n"
+      "fmla v22.4s, v29.4s, v3.s[1]\n"
+      "fmla v26.4s, v29.4s, v4.s[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      "fmla v11.4s, v28.4s, v0.s[1]\n"
+      "fmla v15.4s, v28.4s, v1.s[1]\n"
+      "fmla v19.4s, v28.4s, v2.s[1]\n"
+      "fmla v23.4s, v28.4s, v3.s[1]\n"
+      "fmla v27.4s, v28.4s, v4.s[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      "fmla v8.4s, v29.4s, v0.s[2]\n"
+      "fmla v12.4s, v29.4s, v1.s[2]\n"
+      "fmla v16.4s, v29.4s, v2.s[2]\n"
+      "fmla v20.4s, v29.4s, v3.s[2]\n"
+      "fmla v24.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      "fmla v9.4s, v28.4s, v0.s[2]\n"
+      "fmla v13.4s, v28.4s, v1.s[2]\n"
+      "fmla v17.4s, v28.4s, v2.s[2]\n"
+      "fmla v21.4s, v28.4s, v3.s[2]\n"
+      "fmla v25.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      "fmla v10.4s, v29.4s, v0.s[2]\n"
+      "fmla v14.4s, v29.4s, v1.s[2]\n"
+      "fmla v18.4s, v29.4s, v2.s[2]\n"
+      "fmla v22.4s, v29.4s, v3.s[2]\n"
+      "fmla v26.4s, v29.4s, v4.s[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      "fmla v11.4s, v28.4s, v0.s[2]\n"
+      "fmla v15.4s, v28.4s, v1.s[2]\n"
+      "fmla v19.4s, v28.4s, v2.s[2]\n"
+      "fmla v23.4s, v28.4s, v3.s[2]\n"
+      "fmla v27.4s, v28.4s, v4.s[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      "fmla v8.4s, v29.4s, v0.s[3]\n"
+      "fmla v12.4s, v29.4s, v1.s[3]\n"
+      "fmla v16.4s, v29.4s, v2.s[3]\n"
+      "fmla v20.4s, v29.4s, v3.s[3]\n"
+      "fmla v24.4s, v29.4s, v4.s[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      "fmla v9.4s, v28.4s, v0.s[3]\n"
+      "fmla v13.4s, v28.4s, v1.s[3]\n"
+      "fmla v17.4s, v28.4s, v2.s[3]\n"
+      "fmla v21.4s, v28.4s, v3.s[3]\n"
+      "fmla v25.4s, v28.4s, v4.s[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      "fmla v10.4s, v6.4s, v0.s[3]\n"
-      "fmla v14.4s, v6.4s, v1.s[3]\n"
-      "fmla v18.4s, v6.4s, v2.s[3]\n"
-      "fmla v22.4s, v6.4s, v3.s[3]\n"
-      "fmla v26.4s, v6.4s, v4.s[3]\n"
-      "fmla v11.4s, v7.4s, v0.s[3]\n"
-      "fmla v15.4s, v7.4s, v1.s[3]\n"
-      "fmla v19.4s, v7.4s, v2.s[3]\n"
-      "fmla v23.4s, v7.4s, v3.s[3]\n"
-      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v10.4s, v29.4s, v0.s[3]\n"
+      "fmla v14.4s, v29.4s, v1.s[3]\n"
+      "fmla v18.4s, v29.4s, v2.s[3]\n"
+      "fmla v22.4s, v29.4s, v3.s[3]\n"
+      "fmla v26.4s, v29.4s, v4.s[3]\n"
+      "fmla v11.4s, v28.4s, v0.s[3]\n"
+      "fmla v15.4s, v28.4s, v1.s[3]\n"
+      "fmla v19.4s, v28.4s, v2.s[3]\n"
+      "fmla v23.4s, v28.4s, v3.s[3]\n"
+      "fmla v27.4s, v28.4s, v4.s[3]\n"
       "152:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 154f\n"
       "153:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
       "ldr s1, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s0, [x24], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q29, [x10, #0x0]\n"
+      "fmla v8.4s, v29.4s, v2.s[0]\n"
+      "fmla v12.4s, v29.4s, v1.s[0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      "fmla v16.4s, v29.4s, v0.s[0]\n"
+      "fmla v20.4s, v29.4s, v31.s[0]\n"
+      "fmla v24.4s, v29.4s, v30.s[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "fmla v9.4s, v28.4s, v2.s[0]\n"
+      "fmla v13.4s, v28.4s, v1.s[0]\n"
+      "fmla v17.4s, v28.4s, v0.s[0]\n"
+      "fmla v21.4s, v28.4s, v31.s[0]\n"
+      "fmla v25.4s, v28.4s, v30.s[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "fmla v10.4s, v29.4s, v2.s[0]\n"
+      "fmla v14.4s, v29.4s, v1.s[0]\n"
+      "fmla v18.4s, v29.4s, v0.s[0]\n"
+      "fmla v22.4s, v29.4s, v31.s[0]\n"
+      "fmla v26.4s, v29.4s, v30.s[0]\n"
+      "fmla v11.4s, v28.4s, v2.s[0]\n"
+      "fmla v15.4s, v28.4s, v1.s[0]\n"
+      "fmla v19.4s, v28.4s, v0.s[0]\n"
+      "fmla v23.4s, v28.4s, v31.s[0]\n"
+      "fmla v27.4s, v28.4s, v30.s[0]\n"
       "cbnz x27, 153b\n"
       "154:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2352,49 +2351,49 @@
       "prfm pstl1keep, [x22, #0x0]\n"
       "tbz %x[flags], #1, 155f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v23.4s, v23.4s, v1.4s\n"
-      "fmin v24.4s, v24.4s, v1.4s\n"
-      "fmin v25.4s, v25.4s, v1.4s\n"
-      "fmin v26.4s, v26.4s, v1.4s\n"
-      "fmin v27.4s, v27.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v23.4s, v23.4s, v0.4s\n"
-      "fmax v24.4s, v24.4s, v0.4s\n"
-      "fmax v25.4s, v25.4s, v0.4s\n"
-      "fmax v26.4s, v26.4s, v0.4s\n"
-      "fmax v27.4s, v27.4s, v0.4s\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v29.4s\n"
+      "fmin v9.4s, v9.4s, v29.4s\n"
+      "fmin v10.4s, v10.4s, v29.4s\n"
+      "fmin v11.4s, v11.4s, v29.4s\n"
+      "fmin v12.4s, v12.4s, v29.4s\n"
+      "fmin v13.4s, v13.4s, v29.4s\n"
+      "fmin v14.4s, v14.4s, v29.4s\n"
+      "fmin v15.4s, v15.4s, v29.4s\n"
+      "fmin v16.4s, v16.4s, v29.4s\n"
+      "fmin v17.4s, v17.4s, v29.4s\n"
+      "fmin v18.4s, v18.4s, v29.4s\n"
+      "fmin v19.4s, v19.4s, v29.4s\n"
+      "fmin v20.4s, v20.4s, v29.4s\n"
+      "fmin v21.4s, v21.4s, v29.4s\n"
+      "fmin v22.4s, v22.4s, v29.4s\n"
+      "fmin v23.4s, v23.4s, v29.4s\n"
+      "fmin v24.4s, v24.4s, v29.4s\n"
+      "fmin v25.4s, v25.4s, v29.4s\n"
+      "fmin v26.4s, v26.4s, v29.4s\n"
+      "fmin v27.4s, v27.4s, v29.4s\n"
+      "fmax v8.4s, v8.4s, v28.4s\n"
+      "fmax v9.4s, v9.4s, v28.4s\n"
+      "fmax v10.4s, v10.4s, v28.4s\n"
+      "fmax v11.4s, v11.4s, v28.4s\n"
+      "fmax v12.4s, v12.4s, v28.4s\n"
+      "fmax v13.4s, v13.4s, v28.4s\n"
+      "fmax v14.4s, v14.4s, v28.4s\n"
+      "fmax v15.4s, v15.4s, v28.4s\n"
+      "fmax v16.4s, v16.4s, v28.4s\n"
+      "fmax v17.4s, v17.4s, v28.4s\n"
+      "fmax v18.4s, v18.4s, v28.4s\n"
+      "fmax v19.4s, v19.4s, v28.4s\n"
+      "fmax v20.4s, v20.4s, v28.4s\n"
+      "fmax v21.4s, v21.4s, v28.4s\n"
+      "fmax v22.4s, v22.4s, v28.4s\n"
+      "fmax v23.4s, v23.4s, v28.4s\n"
+      "fmax v24.4s, v24.4s, v28.4s\n"
+      "fmax v25.4s, v25.4s, v28.4s\n"
+      "fmax v26.4s, v26.4s, v28.4s\n"
+      "fmax v27.4s, v27.4s, v28.4s\n"
       "155:"  // Height 5: No activation
       "cmp x11, #0x10\n"
       "bge 164f\n"
@@ -2771,16 +2770,16 @@
       "180:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 181f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 182f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -2792,11 +2791,11 @@
       "b 182f\n"
       "181:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "182:"  // Height 6: input setup done
       "cmp x27, #0x4\n"
       "blt 185f\n"
@@ -3073,42 +3072,42 @@
       "185:"  // Height 6: Multiply loop: Main loop skip
       "cbz x27, 187f\n"
       "186:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s7, [x26], #0x4\n"
+      "ldr s6, [x25], #0x4\n"
       "sub x27, x27, #0x1\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      "fmla v8.4s, v6.4s, v0.s[0]\n"
-      "fmla v12.4s, v6.4s, v1.s[0]\n"
-      "fmla v16.4s, v6.4s, v2.s[0]\n"
-      "fmla v20.4s, v6.4s, v3.s[0]\n"
-      "fmla v24.4s, v6.4s, v4.s[0]\n"
-      "fmla v28.4s, v6.4s, v5.s[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "fmla v9.4s, v7.4s, v0.s[0]\n"
-      "fmla v13.4s, v7.4s, v1.s[0]\n"
-      "fmla v17.4s, v7.4s, v2.s[0]\n"
-      "fmla v21.4s, v7.4s, v3.s[0]\n"
-      "fmla v25.4s, v7.4s, v4.s[0]\n"
-      "fmla v29.4s, v7.4s, v5.s[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s4, [x23], #0x4\n"
+      "ldr s3, [x22], #0x4\n"
+      "ldr s2, [x21], #0x4\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      "fmla v8.4s, v1.4s, v7.s[0]\n"
+      "fmla v12.4s, v1.4s, v6.s[0]\n"
+      "fmla v16.4s, v1.4s, v5.s[0]\n"
+      "fmla v20.4s, v1.4s, v4.s[0]\n"
+      "fmla v24.4s, v1.4s, v3.s[0]\n"
+      "fmla v28.4s, v1.4s, v2.s[0]\n"
+      "ldr q1, [x10, #0x20]\n"
+      "fmla v9.4s, v0.4s, v7.s[0]\n"
+      "fmla v13.4s, v0.4s, v6.s[0]\n"
+      "fmla v17.4s, v0.4s, v5.s[0]\n"
+      "fmla v21.4s, v0.4s, v4.s[0]\n"
+      "fmla v25.4s, v0.4s, v3.s[0]\n"
+      "fmla v29.4s, v0.4s, v2.s[0]\n"
+      "ldr q0, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      "fmla v10.4s, v6.4s, v0.s[0]\n"
-      "fmla v14.4s, v6.4s, v1.s[0]\n"
-      "fmla v18.4s, v6.4s, v2.s[0]\n"
-      "fmla v22.4s, v6.4s, v3.s[0]\n"
-      "fmla v26.4s, v6.4s, v4.s[0]\n"
-      "fmla v30.4s, v6.4s, v5.s[0]\n"
-      "fmla v11.4s, v7.4s, v0.s[0]\n"
-      "fmla v15.4s, v7.4s, v1.s[0]\n"
-      "fmla v19.4s, v7.4s, v2.s[0]\n"
-      "fmla v23.4s, v7.4s, v3.s[0]\n"
-      "fmla v27.4s, v7.4s, v4.s[0]\n"
-      "fmla v31.4s, v7.4s, v5.s[0]\n"
+      "fmla v10.4s, v1.4s, v7.s[0]\n"
+      "fmla v14.4s, v1.4s, v6.s[0]\n"
+      "fmla v18.4s, v1.4s, v5.s[0]\n"
+      "fmla v22.4s, v1.4s, v4.s[0]\n"
+      "fmla v26.4s, v1.4s, v3.s[0]\n"
+      "fmla v30.4s, v1.4s, v2.s[0]\n"
+      "fmla v11.4s, v0.4s, v7.s[0]\n"
+      "fmla v15.4s, v0.4s, v6.s[0]\n"
+      "fmla v19.4s, v0.4s, v5.s[0]\n"
+      "fmla v23.4s, v0.4s, v4.s[0]\n"
+      "fmla v27.4s, v0.4s, v3.s[0]\n"
+      "fmla v31.4s, v0.4s, v2.s[0]\n"
       "cbnz x27, 186b\n"
       "187:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -3350,7 +3349,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "200:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
index 4fad58a..3ec0239 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 
 #define ARGLIST  \
@@ -90,5 +90,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
index 67e0c1e..2368653 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, 2023 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,7 +92,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x8\n"
       "bge 148f\n"
@@ -105,563 +104,563 @@
       "cmp %x[M], #0x2\n"
       "bgt 43f\n"
       "beq 22f\n"
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x15, %x[bias]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "2:"  // Height 1: Column loop
-      "cbz x15, 3f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 3f\n"
+      "ldr q24, [x3, #0x0]\n"
+      "add x3, x3, #0x10\n"
       "b 8f\n"
       "3:"  // Height 1: no bias
       "tbz %x[flags], #0, 7f\n"
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 6f\n"
-      "tbz x17, #1, 4f\n"
-      "ldr d24, [x14], #0x8\n"
-      "mov x8, #0x8\n"
-      "tbz x17, #0, 5f\n"
-      "ld1 { v24.s }[2], [x14]\n"
+      "tbz x4, #1, 4f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "tbz x4, #0, 5f\n"
+      "ld1 { v24.s }[2], [x6]\n"
       "b 5f\n"
       "4:"  // Height 1: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
       "5:"  // Height 1: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 8f\n"
       "6:"  // Height 1: full accumulate
-      "ldr q24, [x14, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
       "b 8f\n"
       "7:"  // Height 1: no accumulate
       "movi v24.16b, #0x0\n"
       "8:"  // Height 1: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "9:"  // Height 1: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 10f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "cbnz x13, 11f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "cbnz x7, 11f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
       "b 11f\n"
       "10:"  // Height 1: setup direct input
-      "mov x11, %x[input_ptr]\n"
+      "mov x17, %x[input_ptr]\n"
       "11:"  // Height 1: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 14f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
-      "cmp x12, #0x8\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 13f\n"
       "12:"  // Height 1: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
-      "ldr x8, [x16, #0x18]\n"
-      "add x11, x11, #0x10\n"
-      "ldr d10, [x16, #0x20]\n"
-      "sub x12, x12, #0x4\n"
-      "ldr x21, [x16, #0x28]\n"
-      "cmp x12, #0x8\n"
-      "mov v9.d[1], x8\n"
-      "ldr d11, [x16, #0x30]\n"
-      "ldr x8, [x16, #0x38]\n"
-      "add x16, x16, #0x40\n"
+      "add x17, x17, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "mov v10.d[1], x21\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
-      "mov v11.d[1], x8\n"
-      "ldr d8, [x16, #0x0]\n"
-      "ldr x26, [x16, #0x8]\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "ldr x10, [x11, #0x8]\n"
-      "mov v8.d[1], x26\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
-      "mov v0.d[1], x10\n"
+      "ldr d0, [x17, #0x0]\n"
+      "sub x8, x8, #0x4\n"
+      "ldr d10, [x5, #0x20]\n"
+      "cmp x8, #0x8\n"
+      "ldr d11, [x5, #0x30]\n"
+      "ldr x26, [x5, #0x8]\n"
+      "mov v8.d[1], x26\n"
+      "ldr x26, [x5, #0x18]\n"
+      "mov v9.d[1], x26\n"
+      "ldr x26, [x17, #0x8]\n"
+      "mov v0.d[1], x26\n"
+      "ldr x26, [x5, #0x28]\n"
+      "mov v10.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v11.d[1], x26\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "bge 12b\n"
       "13:"  // Height 1: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
-      "ldr q10, [x16, #0x20]\n"
-      "sub x12, x12, #0x4\n"
-      "ldr q11, [x16, #0x30]\n"
-      "add x11, x11, #0x10\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
-      "add x16, x16, #0x40\n"
+      "add x17, x17, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "sub x8, x8, #0x4\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "add x5, x5, #0x40\n"
       "14:"  // Height 1: Multiply loop: Main loop skip
-      "cbz x12, 16f\n"
+      "cbz x8, 16f\n"
       "15:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "cbnz x12, 15b\n"
+      "ldr s17, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v17.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "cbnz x8, 15b\n"
       "16:"  // Height 1: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 9b\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
       "tbz %x[flags], #1, 17f\n"
-      "add x8, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v17.4s }, [x8]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
       "17:"  // Height 1: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 20f\n"
-      "tbz x17, #1, 18f\n"
-      "str d24, [x14], #0x8\n"
-      "tbz x17, #0, 19f\n"
-      "st1 { v24.s }[2], [x14]\n"
+      "tbz x4, #1, 18f\n"
+      "str d24, [x6], #0x8\n"
+      "tbz x4, #0, 19f\n"
+      "st1 { v24.s }[2], [x6]\n"
       "b 19f\n"
       "18:"  // Height 1: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
       "19:"  // Height 1: Partial direct writeback: Done
       "b 21f\n"
       "20:"  // Height 1: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
       "21:"  // Height 1: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 2b\n"
       "b 170f\n"
       "22:"  // Height 2
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "23:"  // Height 2: Column loop
-      "cbz x15, 24f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 24f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "b 29f\n"
       "24:"  // Height 2: no bias
       "tbz %x[flags], #0, 28f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "cmp x4, #0x4\n"
+      "add x13, x6, x26, LSL #2\n"
       "bge 27f\n"
-      "tbz x17, #1, 25f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "tbz x17, #0, 26f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
+      "tbz x4, #1, 25f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "tbz x4, #0, 26f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
       "b 26f\n"
       "25:"  // Height 2: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
       "26:"  // Height 2: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 29f\n"
       "27:"  // Height 2: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
       "b 29f\n"
       "28:"  // Height 2: no accumulate
       "movi v24.16b, #0x0\n"
       "movi v25.16b, #0x0\n"
       "29:"  // Height 2: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "30:"  // Height 2: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "cbnz x13, 32f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "cbnz x7, 32f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
       "b 32f\n"
       "31:"  // Height 2: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
       "32:"  // Height 2: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 35f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 34f\n"
       "33:"  // Height 2: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
-      "ldr d10, [x16, #0x20]\n"
-      "add x11, x11, #0x10\n"
-      "ldr x21, [x16, #0x28]\n"
-      "add x9, x9, #0x10\n"
-      "mov v9.d[1], x8\n"
-      "ldr d11, [x16, #0x30]\n"
-      "ldr x8, [x16, #0x38]\n"
-      "sub x12, x12, #0x4\n"
+      "add x16, x16, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "mov v10.d[1], x21\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
-      "mov v11.d[1], x8\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "ldr x10, [x11, #0x8]\n"
-      "cmp x12, #0x8\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "ldr x28, [x9, #0x8]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "add x16, x16, #0x40\n"
-      "ldr d8, [x16, #0x0]\n"
-      "ldr x26, [x16, #0x8]\n"
+      "ldr d10, [x5, #0x20]\n"
+      "ldr x27, [x5, #0x8]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
+      "ldr d0, [x17, #0x0]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "mov v8.d[1], x26\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
+      "ldr d1, [x16, #0x0]\n"
+      "sub x8, x8, #0x4\n"
+      "ldr d11, [x5, #0x30]\n"
+      "cmp x8, #0x8\n"
+      "ldr x26, [x5, #0x18]\n"
+      "mov v8.d[1], x27\n"
+      "ldr x27, [x17, #0x8]\n"
+      "mov v9.d[1], x26\n"
+      "ldr x26, [x16, #0x8]\n"
+      "mov v0.d[1], x27\n"
+      "ldr x27, [x5, #0x28]\n"
+      "mov v1.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v10.d[1], x27\n"
+      "mov v11.d[1], x26\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "bge 33b\n"
       "34:"  // Height 2: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
-      "ldr q11, [x16, #0x30]\n"
-      "sub x12, x12, #0x4\n"
-      "add x11, x11, #0x10\n"
-      "add x9, x9, #0x10\n"
+      "add x16, x16, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "add x16, x16, #0x40\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "add x5, x5, #0x40\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
       "35:"  // Height 2: Multiply loop: Main loop skip
-      "cbz x12, 37f\n"
+      "cbz x8, 37f\n"
       "36:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "cbnz x12, 36b\n"
+      "ldr s18, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s17, [x16], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v18.s[0]\n"
+      "fmla v25.4s, v16.4s, v17.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "cbnz x8, 36b\n"
       "37:"  // Height 2: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 30b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
-      "prfm pstl1keep, [x27, #0x0]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
       "tbz %x[flags], #1, 38f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
       "38:"  // Height 2: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 41f\n"
-      "tbz x17, #1, 39f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "tbz x17, #0, 40f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
+      "tbz x4, #1, 39f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "tbz x4, #0, 40f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
       "b 40f\n"
       "39:"  // Height 2: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
       "40:"  // Height 2: Partial direct writeback: Done
       "b 42f\n"
       "41:"  // Height 2: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
       "42:"  // Height 2: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 23b\n"
       "b 170f\n"
       "43:"  // Height 3
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "44:"  // Height 3: Column loop
-      "cbz x15, 45f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 45f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
       "mov v26.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "b 50f\n"
       "45:"  // Height 3: no bias
       "tbz %x[flags], #0, 49f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
-      "add x26, x27, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "cmp x4, #0x4\n"
+      "add x12, x13, x26, LSL #2\n"
       "bge 48f\n"
-      "tbz x17, #1, 46f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "ldr d26, [x26], #0x8\n"
-      "tbz x17, #0, 47f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
-      "ld1 { v26.s }[2], [x26]\n"
+      "tbz x4, #1, 46f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x12], #0x8\n"
+      "tbz x4, #0, 47f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x12]\n"
       "b 47f\n"
       "46:"  // Height 3: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
-      "ldr s26, [x26, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x12, #0x0]\n"
       "47:"  // Height 3: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 50f\n"
       "48:"  // Height 3: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
-      "ldr q26, [x26, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
       "b 50f\n"
       "49:"  // Height 3: no accumulate
       "movi v24.16b, #0x0\n"
       "movi v25.16b, #0x0\n"
       "movi v26.16b, #0x0\n"
       "50:"  // Height 3: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "51:"  // Height 3: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 52f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "ldr x27, [x20, #0x10]\n"
-      "cbnz x13, 53f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
-      "add x27, x27, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "ldr x15, [x26, #0x10]\n"
+      "cbnz x7, 53f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
+      "add x15, x15, x26, LSL #2\n"
       "b 53f\n"
       "52:"  // Height 3: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
-      "add x27, x9, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
+      "add x15, x16, x27, LSL #2\n"
       "53:"  // Height 3: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 56f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q2, [x15, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 55f\n"
       "54:"  // Height 3: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr d10, [x16, #0x20]\n"
-      "ldr x21, [x16, #0x28]\n"
-      "add x11, x11, #0x10\n"
-      "mov v9.d[1], x8\n"
-      "ldr d11, [x16, #0x30]\n"
-      "ldr x8, [x16, #0x38]\n"
-      "add x9, x9, #0x10\n"
+      "add x15, x15, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "mov v10.d[1], x21\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "mov v11.d[1], x8\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "add x27, x27, #0x10\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "ldr x28, [x5, #0x8]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "ldr x10, [x11, #0x8]\n"
+      "ldr x27, [x5, #0x18]\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "sub x12, x12, #0x4\n"
+      "ldr d10, [x5, #0x20]\n"
+      "ldr x26, [x5, #0x28]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
+      "ldr d0, [x17, #0x0]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x16, #0x0]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "mov v0.d[1], x10\n"
-      "cmp x12, #0x8\n"
+      "ldr d2, [x15, #0x0]\n"
+      "sub x8, x8, #0x4\n"
+      "ldr d11, [x5, #0x30]\n"
+      "cmp x8, #0x8\n"
+      "ldr x9, [x17, #0x8]\n"
+      "mov v8.d[1], x28\n"
+      "ldr x28, [x16, #0x8]\n"
+      "mov v9.d[1], x27\n"
+      "ldr x27, [x15, #0x8]\n"
+      "mov v10.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v0.d[1], x9\n"
       "mov v1.d[1], x28\n"
-      "add x16, x16, #0x40\n"
-      "mov v2.d[1], x26\n"
-      "ldr d8, [x16, #0x0]\n"
-      "ldr x26, [x16, #0x8]\n"
-      "mov v8.d[1], x26\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
+      "mov v2.d[1], x27\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
+      "mov v11.d[1], x26\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "bge 54b\n"
       "55:"  // Height 3: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr q11, [x16, #0x30]\n"
-      "sub x12, x12, #0x4\n"
-      "add x11, x11, #0x10\n"
+      "add x15, x15, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "add x9, x9, #0x10\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "add x27, x27, #0x10\n"
-      "add x16, x16, #0x40\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "add x5, x5, #0x40\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
       "56:"  // Height 3: Multiply loop: Main loop skip
-      "cbz x12, 58f\n"
+      "cbz x8, 58f\n"
       "57:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "cbnz x12, 57b\n"
+      "ldr s19, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s18, [x16], #0x4\n"
+      "ldr s17, [x15], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v19.s[0]\n"
+      "fmla v25.4s, v16.4s, v18.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "fmla v26.4s, v16.4s, v17.s[0]\n"
+      "cbnz x8, 57b\n"
       "58:"  // Height 3: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 51b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
-      "prfm pstl1keep, [x27, #0x0]\n"
-      "add x26, x27, x8, LSL #2\n"
-      "prfm pstl1keep, [x26, #0x0]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x12, #0x0]\n"
       "tbz %x[flags], #1, 59f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
       "fmin v26.4s, v26.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
-      "fmax v26.4s, v26.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
+      "fmax v26.4s, v26.4s, v16.4s\n"
       "59:"  // Height 3: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 62f\n"
-      "tbz x17, #1, 60f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "str d26, [x26], #0x8\n"
-      "tbz x17, #0, 61f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
-      "st1 { v26.s }[2], [x26]\n"
+      "tbz x4, #1, 60f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x12], #0x8\n"
+      "tbz x4, #0, 61f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x12]\n"
       "b 61f\n"
       "60:"  // Height 3: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
-      "str s26, [x26, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x12, #0x0]\n"
       "61:"  // Height 3: Partial direct writeback: Done
       "b 63f\n"
       "62:"  // Height 3: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
-      "str q26, [x26, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x12, #0x0]\n"
       "63:"  // Height 3: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 44b\n"
       "b 170f\n"
       "64:"  // Height 4
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "65:"  // Height 4: Column loop
-      "cbz x15, 66f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 66f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
       "mov v26.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "mov v27.16b, v24.16b\n"
       "b 71f\n"
       "66:"  // Height 4: no bias
       "tbz %x[flags], #0, 70f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
-      "add x26, x27, x8, LSL #2\n"
-      "add x25, x26, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "cmp x4, #0x4\n"
+      "add x11, x12, x26, LSL #2\n"
       "bge 69f\n"
-      "tbz x17, #1, 67f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "ldr d26, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "tbz x17, #0, 68f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
-      "ld1 { v26.s }[2], [x26]\n"
-      "ld1 { v27.s }[2], [x25]\n"
+      "tbz x4, #1, 67f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x12], #0x8\n"
+      "ldr d27, [x11], #0x8\n"
+      "tbz x4, #0, 68f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x12]\n"
+      "ld1 { v27.s }[2], [x11]\n"
       "b 68f\n"
       "67:"  // Height 4: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
-      "ldr s26, [x26, #0x0]\n"
-      "ldr s27, [x25, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x12, #0x0]\n"
+      "ldr s27, [x11, #0x0]\n"
       "68:"  // Height 4: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 71f\n"
       "69:"  // Height 4: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
-      "ldr q26, [x26, #0x0]\n"
-      "ldr q27, [x25, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q27, [x11, #0x0]\n"
       "b 71f\n"
       "70:"  // Height 4: no accumulate
       "movi v24.16b, #0x0\n"
@@ -669,248 +668,248 @@
       "movi v26.16b, #0x0\n"
       "movi v27.16b, #0x0\n"
       "71:"  // Height 4: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "72:"  // Height 4: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 73f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "ldr x27, [x20, #0x10]\n"
-      "ldr x25, [x20, #0x18]\n"
-      "cbnz x13, 74f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
-      "add x27, x27, x8, LSL #2\n"
-      "add x25, x25, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "ldr x15, [x26, #0x10]\n"
+      "ldr x14, [x26, #0x18]\n"
+      "cbnz x7, 74f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
+      "add x15, x15, x26, LSL #2\n"
+      "add x14, x14, x26, LSL #2\n"
       "b 74f\n"
       "73:"  // Height 4: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
-      "add x27, x9, x8, LSL #2\n"
-      "add x25, x27, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
+      "add x15, x16, x27, LSL #2\n"
+      "add x14, x15, x27, LSL #2\n"
       "74:"  // Height 4: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 77f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q2, [x15, #0x0]\n"
+      "ldr q3, [x14, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 76f\n"
       "75:"  // Height 4: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr d10, [x16, #0x20]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "ldr x21, [x16, #0x28]\n"
-      "mov v9.d[1], x8\n"
-      "ldr d11, [x16, #0x30]\n"
-      "ldr x8, [x16, #0x38]\n"
-      "add x11, x11, #0x10\n"
+      "add x14, x14, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "mov v10.d[1], x21\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "mov v11.d[1], x8\n"
+      "ldr x27, [x5, #0x8]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "ldr x10, [x11, #0x8]\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x9, x9, #0x10\n"
+      "ldr x26, [x5, #0x18]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "ldr x11, [x5, #0x28]\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x10, [x17, #0x8]\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "add x27, x27, #0x10\n"
+      "ldr d10, [x5, #0x20]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "ldr d0, [x17, #0x0]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
+      "ldr d1, [x16, #0x0]\n"
+      "ldr x9, [x16, #0x8]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d2, [x15, #0x0]\n"
       "fmla v27.4s, v11.4s, v3.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d3, [x14, #0x0]\n"
+      "sub x8, x8, #0x4\n"
+      "ldr d11, [x5, #0x30]\n"
+      "cmp x8, #0x8\n"
+      "ldr x28, [x15, #0x8]\n"
+      "mov v8.d[1], x27\n"
+      "ldr x27, [x14, #0x8]\n"
+      "mov v9.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v10.d[1], x11\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "mov v0.d[1], x10\n"
-      "ldr x26, [x27, #0x8]\n"
-      "mov v1.d[1], x28\n"
-      "add x25, x25, #0x10\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "sub x12, x12, #0x4\n"
-      "mov v2.d[1], x26\n"
-      "ldr d3, [x25, #0x0]\n"
-      "ldr x8, [x25, #0x8]\n"
-      "cmp x12, #0x8\n"
-      "add x16, x16, #0x40\n"
-      "ldr d8, [x16, #0x0]\n"
-      "mov v3.d[1], x8\n"
-      "ldr x26, [x16, #0x8]\n"
-      "mov v8.d[1], x26\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
+      "mov v1.d[1], x9\n"
+      "mov v2.d[1], x28\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
+      "mov v3.d[1], x27\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "mov v11.d[1], x26\n"
       "bge 75b\n"
       "76:"  // Height 4: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr q11, [x16, #0x30]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "sub x12, x12, #0x4\n"
+      "add x14, x14, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "add x11, x11, #0x10\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "add x9, x9, #0x10\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x27, x27, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "add x5, x5, #0x40\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "add x25, x25, #0x10\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "add x16, x16, #0x40\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
       "fmla v27.4s, v11.4s, v3.s[3]\n"
       "77:"  // Height 4: Multiply loop: Main loop skip
-      "cbz x12, 79f\n"
+      "cbz x8, 79f\n"
       "78:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "cbnz x12, 78b\n"
+      "ldr s20, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s19, [x16], #0x4\n"
+      "ldr s18, [x15], #0x4\n"
+      "ldr s17, [x14], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v20.s[0]\n"
+      "fmla v25.4s, v16.4s, v19.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "fmla v26.4s, v16.4s, v18.s[0]\n"
+      "fmla v27.4s, v16.4s, v17.s[0]\n"
+      "cbnz x8, 78b\n"
       "79:"  // Height 4: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 72b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
-      "prfm pstl1keep, [x27, #0x0]\n"
-      "add x26, x27, x8, LSL #2\n"
-      "prfm pstl1keep, [x26, #0x0]\n"
-      "add x25, x26, x8, LSL #2\n"
-      "prfm pstl1keep, [x25, #0x0]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x12, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
       "tbz %x[flags], #1, 80f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
       "fmin v26.4s, v26.4s, v16.4s\n"
       "fmin v27.4s, v27.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
-      "fmax v26.4s, v26.4s, v17.4s\n"
-      "fmax v27.4s, v27.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
+      "fmax v26.4s, v26.4s, v16.4s\n"
+      "fmax v27.4s, v27.4s, v16.4s\n"
       "80:"  // Height 4: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 83f\n"
-      "tbz x17, #1, 81f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "str d26, [x26], #0x8\n"
-      "str d27, [x25], #0x8\n"
-      "tbz x17, #0, 82f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
-      "st1 { v26.s }[2], [x26]\n"
-      "st1 { v27.s }[2], [x25]\n"
+      "tbz x4, #1, 81f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x12], #0x8\n"
+      "str d27, [x11], #0x8\n"
+      "tbz x4, #0, 82f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x12]\n"
+      "st1 { v27.s }[2], [x11]\n"
       "b 82f\n"
       "81:"  // Height 4: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
-      "str s26, [x26, #0x0]\n"
-      "str s27, [x25, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x12, #0x0]\n"
+      "str s27, [x11, #0x0]\n"
       "82:"  // Height 4: Partial direct writeback: Done
       "b 84f\n"
       "83:"  // Height 4: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
-      "str q26, [x26, #0x0]\n"
-      "str q27, [x25, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x12, #0x0]\n"
+      "str q27, [x11, #0x0]\n"
       "84:"  // Height 4: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 65b\n"
       "b 170f\n"
       "85:"  // Height 5
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "86:"  // Height 5: Column loop
-      "cbz x15, 87f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 87f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
       "mov v26.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "mov v27.16b, v24.16b\n"
       "mov v28.16b, v24.16b\n"
       "b 92f\n"
       "87:"  // Height 5: no bias
       "tbz %x[flags], #0, 91f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
-      "add x26, x27, x8, LSL #2\n"
-      "add x25, x26, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "cmp x4, #0x4\n"
+      "add x10, x11, x26, LSL #2\n"
       "bge 90f\n"
-      "tbz x17, #1, 88f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "ldr d26, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "ldr d28, [x24], #0x8\n"
-      "tbz x17, #0, 89f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
-      "ld1 { v26.s }[2], [x26]\n"
-      "ld1 { v27.s }[2], [x25]\n"
-      "ld1 { v28.s }[2], [x24]\n"
+      "tbz x4, #1, 88f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x12], #0x8\n"
+      "ldr d27, [x11], #0x8\n"
+      "ldr d28, [x10], #0x8\n"
+      "tbz x4, #0, 89f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x12]\n"
+      "ld1 { v27.s }[2], [x11]\n"
+      "ld1 { v28.s }[2], [x10]\n"
       "b 89f\n"
       "88:"  // Height 5: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
-      "ldr s26, [x26, #0x0]\n"
-      "ldr s27, [x25, #0x0]\n"
-      "ldr s28, [x24, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x12, #0x0]\n"
+      "ldr s27, [x11, #0x0]\n"
+      "ldr s28, [x10, #0x0]\n"
       "89:"  // Height 5: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 92f\n"
       "90:"  // Height 5: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
-      "ldr q26, [x26, #0x0]\n"
-      "ldr q27, [x25, #0x0]\n"
-      "ldr q28, [x24, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q27, [x11, #0x0]\n"
+      "ldr q28, [x10, #0x0]\n"
       "b 92f\n"
       "91:"  // Height 5: no accumulate
       "movi v24.16b, #0x0\n"
@@ -919,283 +918,283 @@
       "movi v27.16b, #0x0\n"
       "movi v28.16b, #0x0\n"
       "92:"  // Height 5: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "93:"  // Height 5: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 94f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "ldr x27, [x20, #0x10]\n"
-      "ldr x25, [x20, #0x18]\n"
-      "ldr x24, [x20, #0x20]\n"
-      "cbnz x13, 95f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
-      "add x27, x27, x8, LSL #2\n"
-      "add x25, x25, x8, LSL #2\n"
-      "add x24, x24, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "ldr x15, [x26, #0x10]\n"
+      "ldr x14, [x26, #0x18]\n"
+      "ldr x13, [x26, #0x20]\n"
+      "cbnz x7, 95f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
+      "add x15, x15, x26, LSL #2\n"
+      "add x14, x14, x26, LSL #2\n"
+      "add x13, x13, x26, LSL #2\n"
       "b 95f\n"
       "94:"  // Height 5: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
-      "add x27, x9, x8, LSL #2\n"
-      "add x25, x27, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
+      "add x15, x16, x27, LSL #2\n"
+      "add x14, x15, x27, LSL #2\n"
+      "add x13, x14, x27, LSL #2\n"
       "95:"  // Height 5: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 98f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x24, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q2, [x15, #0x0]\n"
+      "ldr q3, [x14, #0x0]\n"
+      "ldr q4, [x13, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 97f\n"
       "96:"  // Height 5: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr d10, [x16, #0x20]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "ldr x21, [x16, #0x28]\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "mov v9.d[1], x8\n"
-      "ldr d11, [x16, #0x30]\n"
-      "add x11, x11, #0x10\n"
+      "add x13, x13, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "mov v10.d[1], x21\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "ldr x8, [x16, #0x38]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "ldr x27, [x5, #0x8]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "ldr x10, [x11, #0x8]\n"
+      "ldr x26, [x5, #0x18]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "mov v11.d[1], x8\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x9, x9, #0x10\n"
+      "ldr x12, [x5, #0x28]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "ldr x11, [x17, #0x8]\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x10, [x16, #0x8]\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "add x27, x27, #0x10\n"
+      "ldr x9, [x15, #0x8]\n"
       "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "ldr d10, [x5, #0x20]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
+      "ldr d0, [x17, #0x0]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x16, #0x0]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x15, #0x0]\n"
       "fmla v27.4s, v11.4s, v3.s[3]\n"
-      "mov v0.d[1], x10\n"
+      "ldr d3, [x14, #0x0]\n"
+      "ldr x28, [x14, #0x8]\n"
       "fmla v28.4s, v11.4s, v4.s[3]\n"
-      "mov v1.d[1], x28\n"
-      "ldr x26, [x27, #0x8]\n"
-      "add x25, x25, #0x10\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "add x24, x24, #0x10\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
-      "sub x12, x12, #0x4\n"
-      "mov v2.d[1], x26\n"
-      "ldr d3, [x25, #0x0]\n"
-      "ldr x8, [x25, #0x8]\n"
-      "cmp x12, #0x8\n"
-      "ldr d4, [x24, #0x0]\n"
-      "add x16, x16, #0x40\n"
-      "ldr x21, [x24, #0x8]\n"
-      "mov v3.d[1], x8\n"
-      "ldr d8, [x16, #0x0]\n"
-      "ldr x26, [x16, #0x8]\n"
-      "mov v4.d[1], x21\n"
-      "mov v8.d[1], x26\n"
+      "ldr d4, [x13, #0x0]\n"
+      "sub x8, x8, #0x4\n"
+      "ldr d11, [x5, #0x30]\n"
+      "cmp x8, #0x8\n"
+      "mov v8.d[1], x27\n"
+      "ldr x27, [x13, #0x8]\n"
+      "mov v9.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
+      "mov v10.d[1], x12\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
+      "mov v0.d[1], x11\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
+      "mov v1.d[1], x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "mov v2.d[1], x9\n"
+      "mov v3.d[1], x28\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
+      "mov v4.d[1], x27\n"
+      "mov v11.d[1], x26\n"
       "bge 96b\n"
       "97:"  // Height 5: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr q11, [x16, #0x30]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "sub x12, x12, #0x4\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "add x11, x11, #0x10\n"
+      "add x13, x13, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "add x9, x9, #0x10\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "add x27, x27, #0x10\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x25, x25, #0x10\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "add x5, x5, #0x40\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "add x24, x24, #0x10\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
       "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "add x16, x16, #0x40\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
       "fmla v27.4s, v11.4s, v3.s[3]\n"
       "fmla v28.4s, v11.4s, v4.s[3]\n"
       "98:"  // Height 5: Multiply loop: Main loop skip
-      "cbz x12, 100f\n"
+      "cbz x8, 100f\n"
       "99:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "cbnz x12, 99b\n"
+      "ldr s21, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s20, [x16], #0x4\n"
+      "ldr s19, [x15], #0x4\n"
+      "ldr s18, [x14], #0x4\n"
+      "ldr s17, [x13], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v21.s[0]\n"
+      "fmla v25.4s, v16.4s, v20.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "fmla v26.4s, v16.4s, v19.s[0]\n"
+      "fmla v27.4s, v16.4s, v18.s[0]\n"
+      "fmla v28.4s, v16.4s, v17.s[0]\n"
+      "cbnz x8, 99b\n"
       "100:"  // Height 5: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 93b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
-      "prfm pstl1keep, [x27, #0x0]\n"
-      "add x26, x27, x8, LSL #2\n"
-      "prfm pstl1keep, [x26, #0x0]\n"
-      "add x25, x26, x8, LSL #2\n"
-      "prfm pstl1keep, [x25, #0x0]\n"
-      "add x24, x25, x8, LSL #2\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x12, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x10, #0x0]\n"
       "tbz %x[flags], #1, 101f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
       "fmin v26.4s, v26.4s, v16.4s\n"
       "fmin v27.4s, v27.4s, v16.4s\n"
       "fmin v28.4s, v28.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
-      "fmax v26.4s, v26.4s, v17.4s\n"
-      "fmax v27.4s, v27.4s, v17.4s\n"
-      "fmax v28.4s, v28.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
+      "fmax v26.4s, v26.4s, v16.4s\n"
+      "fmax v27.4s, v27.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v16.4s\n"
       "101:"  // Height 5: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 104f\n"
-      "tbz x17, #1, 102f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "str d26, [x26], #0x8\n"
-      "str d27, [x25], #0x8\n"
-      "str d28, [x24], #0x8\n"
-      "tbz x17, #0, 103f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
-      "st1 { v26.s }[2], [x26]\n"
-      "st1 { v27.s }[2], [x25]\n"
-      "st1 { v28.s }[2], [x24]\n"
+      "tbz x4, #1, 102f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x12], #0x8\n"
+      "str d27, [x11], #0x8\n"
+      "str d28, [x10], #0x8\n"
+      "tbz x4, #0, 103f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x12]\n"
+      "st1 { v27.s }[2], [x11]\n"
+      "st1 { v28.s }[2], [x10]\n"
       "b 103f\n"
       "102:"  // Height 5: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
-      "str s26, [x26, #0x0]\n"
-      "str s27, [x25, #0x0]\n"
-      "str s28, [x24, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x12, #0x0]\n"
+      "str s27, [x11, #0x0]\n"
+      "str s28, [x10, #0x0]\n"
       "103:"  // Height 5: Partial direct writeback: Done
       "b 105f\n"
       "104:"  // Height 5: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
-      "str q26, [x26, #0x0]\n"
-      "str q27, [x25, #0x0]\n"
-      "str q28, [x24, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x12, #0x0]\n"
+      "str q27, [x11, #0x0]\n"
+      "str q28, [x10, #0x0]\n"
       "105:"  // Height 5: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 86b\n"
       "b 170f\n"
       "106:"  // Height 6
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "107:"  // Height 6: Column loop
-      "cbz x15, 108f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 108f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
       "mov v26.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "mov v27.16b, v24.16b\n"
       "mov v28.16b, v24.16b\n"
       "mov v29.16b, v24.16b\n"
       "b 113f\n"
       "108:"  // Height 6: no bias
       "tbz %x[flags], #0, 112f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
-      "add x26, x27, x8, LSL #2\n"
-      "add x25, x26, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
-      "add x23, x24, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "cmp x4, #0x4\n"
+      "add x9, x10, x26, LSL #2\n"
       "bge 111f\n"
-      "tbz x17, #1, 109f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "ldr d26, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "ldr d28, [x24], #0x8\n"
-      "ldr d29, [x23], #0x8\n"
-      "tbz x17, #0, 110f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
-      "ld1 { v26.s }[2], [x26]\n"
-      "ld1 { v27.s }[2], [x25]\n"
-      "ld1 { v28.s }[2], [x24]\n"
-      "ld1 { v29.s }[2], [x23]\n"
+      "tbz x4, #1, 109f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x12], #0x8\n"
+      "ldr d27, [x11], #0x8\n"
+      "ldr d28, [x10], #0x8\n"
+      "ldr d29, [x9], #0x8\n"
+      "tbz x4, #0, 110f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x12]\n"
+      "ld1 { v27.s }[2], [x11]\n"
+      "ld1 { v28.s }[2], [x10]\n"
+      "ld1 { v29.s }[2], [x9]\n"
       "b 110f\n"
       "109:"  // Height 6: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
-      "ldr s26, [x26, #0x0]\n"
-      "ldr s27, [x25, #0x0]\n"
-      "ldr s28, [x24, #0x0]\n"
-      "ldr s29, [x23, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x12, #0x0]\n"
+      "ldr s27, [x11, #0x0]\n"
+      "ldr s28, [x10, #0x0]\n"
+      "ldr s29, [x9, #0x0]\n"
       "110:"  // Height 6: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 113f\n"
       "111:"  // Height 6: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
-      "ldr q26, [x26, #0x0]\n"
-      "ldr q27, [x25, #0x0]\n"
-      "ldr q28, [x24, #0x0]\n"
-      "ldr q29, [x23, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q27, [x11, #0x0]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q29, [x9, #0x0]\n"
       "b 113f\n"
       "112:"  // Height 6: no accumulate
       "movi v24.16b, #0x0\n"
@@ -1205,154 +1204,154 @@
       "movi v28.16b, #0x0\n"
       "movi v29.16b, #0x0\n"
       "113:"  // Height 6: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "114:"  // Height 6: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 115f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "ldr x27, [x20, #0x10]\n"
-      "ldr x25, [x20, #0x18]\n"
-      "ldr x24, [x20, #0x20]\n"
-      "ldr x23, [x20, #0x28]\n"
-      "cbnz x13, 116f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
-      "add x27, x27, x8, LSL #2\n"
-      "add x25, x25, x8, LSL #2\n"
-      "add x24, x24, x8, LSL #2\n"
-      "add x23, x23, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "ldr x15, [x26, #0x10]\n"
+      "ldr x14, [x26, #0x18]\n"
+      "ldr x13, [x26, #0x20]\n"
+      "ldr x12, [x26, #0x28]\n"
+      "cbnz x7, 116f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
+      "add x15, x15, x26, LSL #2\n"
+      "add x14, x14, x26, LSL #2\n"
+      "add x13, x13, x26, LSL #2\n"
+      "add x12, x12, x26, LSL #2\n"
       "b 116f\n"
       "115:"  // Height 6: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
-      "add x27, x9, x8, LSL #2\n"
-      "add x25, x27, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
-      "add x23, x24, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
+      "add x15, x16, x27, LSL #2\n"
+      "add x14, x15, x27, LSL #2\n"
+      "add x13, x14, x27, LSL #2\n"
+      "add x12, x13, x27, LSL #2\n"
       "116:"  // Height 6: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 119f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x24, #0x0]\n"
-      "ldr q5, [x23, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q2, [x15, #0x0]\n"
+      "ldr q3, [x14, #0x0]\n"
+      "ldr q4, [x13, #0x0]\n"
+      "ldr q5, [x12, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 118f\n"
       "117:"  // Height 6: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr d10, [x16, #0x20]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "ldr x21, [x16, #0x28]\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "mov v9.d[1], x8\n"
+      "add x13, x13, #0x10\n"
       "fmla v29.4s, v8.4s, v5.s[0]\n"
-      "ldr d11, [x16, #0x30]\n"
+      "add x12, x12, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "mov v10.d[1], x21\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "ldr x8, [x16, #0x38]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "add x11, x11, #0x10\n"
+      "ldr x9, [x5, #0x8]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "ldr x28, [x5, #0x18]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "mov v11.d[1], x8\n"
+      "ldr x27, [x5, #0x28]\n"
       "fmla v29.4s, v9.4s, v5.s[1]\n"
-      "ldr x10, [x11, #0x8]\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x9, x9, #0x10\n"
+      "ldr x26, [x17, #0x8]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "ldr x11, [x16, #0x8]\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x10, [x15, #0x8]\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "add x27, x27, #0x10\n"
+      "sub x8, x8, #0x4\n"
       "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "cmp x8, #0x8\n"
       "fmla v29.4s, v10.4s, v5.s[2]\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr d10, [x5, #0x20]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
+      "ldr d0, [x17, #0x0]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x16, #0x0]\n"
       "fmla v26.4s, v11.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x15, #0x0]\n"
       "fmla v27.4s, v11.4s, v3.s[3]\n"
-      "mov v0.d[1], x10\n"
+      "ldr d3, [x14, #0x0]\n"
       "fmla v28.4s, v11.4s, v4.s[3]\n"
-      "mov v1.d[1], x28\n"
+      "ldr d4, [x13, #0x0]\n"
       "fmla v29.4s, v11.4s, v5.s[3]\n"
-      "mov v2.d[1], x26\n"
-      "add x25, x25, #0x10\n"
-      "add x24, x24, #0x10\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "add x23, x23, #0x10\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
-      "sub x12, x12, #0x4\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "cmp x12, #0x8\n"
-      "ldr d3, [x25, #0x0]\n"
-      "add x16, x16, #0x40\n"
-      "ldr x8, [x25, #0x8]\n"
-      "ldr d4, [x24, #0x0]\n"
-      "ldr x21, [x24, #0x8]\n"
-      "mov v3.d[1], x8\n"
-      "ldr d5, [x23, #0x0]\n"
-      "ldr x8, [x23, #0x8]\n"
-      "mov v4.d[1], x21\n"
-      "ldr d8, [x16, #0x0]\n"
-      "ldr x26, [x16, #0x8]\n"
-      "mov v5.d[1], x8\n"
-      "mov v8.d[1], x26\n"
+      "ldr d5, [x12, #0x0]\n"
+      "ldr d11, [x5, #0x30]\n"
+      "mov v8.d[1], x9\n"
+      "ldr x9, [x14, #0x8]\n"
+      "mov v9.d[1], x28\n"
+      "ldr x28, [x13, #0x8]\n"
+      "mov v10.d[1], x27\n"
+      "ldr x27, [x12, #0x8]\n"
+      "mov v0.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v1.d[1], x11\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
+      "mov v2.d[1], x10\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
+      "mov v3.d[1], x9\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
+      "mov v4.d[1], x28\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "mov v5.d[1], x27\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
+      "mov v11.d[1], x26\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "bge 117b\n"
       "118:"  // Height 6: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr q11, [x16, #0x30]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "sub x12, x12, #0x4\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "add x11, x11, #0x10\n"
+      "add x13, x13, #0x10\n"
       "fmla v29.4s, v8.4s, v5.s[0]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "add x12, x12, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "add x9, x9, #0x10\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "add x27, x27, #0x10\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "add x25, x25, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
       "fmla v29.4s, v9.4s, v5.s[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
+      "add x5, x5, #0x40\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "add x23, x23, #0x10\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
       "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "add x16, x16, #0x40\n"
       "fmla v29.4s, v10.4s, v5.s[2]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
       "fmla v25.4s, v11.4s, v1.s[3]\n"
@@ -1361,108 +1360,108 @@
       "fmla v28.4s, v11.4s, v4.s[3]\n"
       "fmla v29.4s, v11.4s, v5.s[3]\n"
       "119:"  // Height 6: Multiply loop: Main loop skip
-      "cbz x12, 121f\n"
+      "cbz x8, 121f\n"
       "120:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr s5, [x23], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "fmla v29.4s, v12.4s, v5.s[0]\n"
-      "cbnz x12, 120b\n"
+      "ldr s22, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s21, [x16], #0x4\n"
+      "ldr s20, [x15], #0x4\n"
+      "ldr s19, [x14], #0x4\n"
+      "ldr s18, [x13], #0x4\n"
+      "ldr s17, [x12], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v22.s[0]\n"
+      "fmla v25.4s, v16.4s, v21.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "fmla v26.4s, v16.4s, v20.s[0]\n"
+      "fmla v27.4s, v16.4s, v19.s[0]\n"
+      "fmla v28.4s, v16.4s, v18.s[0]\n"
+      "fmla v29.4s, v16.4s, v17.s[0]\n"
+      "cbnz x8, 120b\n"
       "121:"  // Height 6: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 114b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
-      "prfm pstl1keep, [x27, #0x0]\n"
-      "add x26, x27, x8, LSL #2\n"
-      "prfm pstl1keep, [x26, #0x0]\n"
-      "add x25, x26, x8, LSL #2\n"
-      "prfm pstl1keep, [x25, #0x0]\n"
-      "add x24, x25, x8, LSL #2\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
-      "add x23, x24, x8, LSL #2\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "add x9, x10, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x12, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x10, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
       "tbz %x[flags], #1, 122f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
       "fmin v26.4s, v26.4s, v16.4s\n"
       "fmin v27.4s, v27.4s, v16.4s\n"
       "fmin v28.4s, v28.4s, v16.4s\n"
       "fmin v29.4s, v29.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
-      "fmax v26.4s, v26.4s, v17.4s\n"
-      "fmax v27.4s, v27.4s, v17.4s\n"
-      "fmax v28.4s, v28.4s, v17.4s\n"
-      "fmax v29.4s, v29.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
+      "fmax v26.4s, v26.4s, v16.4s\n"
+      "fmax v27.4s, v27.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v16.4s\n"
+      "fmax v29.4s, v29.4s, v16.4s\n"
       "122:"  // Height 6: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 125f\n"
-      "tbz x17, #1, 123f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "str d26, [x26], #0x8\n"
-      "str d27, [x25], #0x8\n"
-      "str d28, [x24], #0x8\n"
-      "str d29, [x23], #0x8\n"
-      "tbz x17, #0, 124f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
-      "st1 { v26.s }[2], [x26]\n"
-      "st1 { v27.s }[2], [x25]\n"
-      "st1 { v28.s }[2], [x24]\n"
-      "st1 { v29.s }[2], [x23]\n"
+      "tbz x4, #1, 123f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x12], #0x8\n"
+      "str d27, [x11], #0x8\n"
+      "str d28, [x10], #0x8\n"
+      "str d29, [x9], #0x8\n"
+      "tbz x4, #0, 124f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x12]\n"
+      "st1 { v27.s }[2], [x11]\n"
+      "st1 { v28.s }[2], [x10]\n"
+      "st1 { v29.s }[2], [x9]\n"
       "b 124f\n"
       "123:"  // Height 6: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
-      "str s26, [x26, #0x0]\n"
-      "str s27, [x25, #0x0]\n"
-      "str s28, [x24, #0x0]\n"
-      "str s29, [x23, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x12, #0x0]\n"
+      "str s27, [x11, #0x0]\n"
+      "str s28, [x10, #0x0]\n"
+      "str s29, [x9, #0x0]\n"
       "124:"  // Height 6: Partial direct writeback: Done
       "b 126f\n"
       "125:"  // Height 6: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
-      "str q26, [x26, #0x0]\n"
-      "str q27, [x25, #0x0]\n"
-      "str q28, [x24, #0x0]\n"
-      "str q29, [x23, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x12, #0x0]\n"
+      "str q27, [x11, #0x0]\n"
+      "str q28, [x10, #0x0]\n"
+      "str q29, [x9, #0x0]\n"
       "126:"  // Height 6: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 107b\n"
       "b 170f\n"
       "127:"  // Height 7
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
       "128:"  // Height 7: Column loop
-      "cbz x15, 129f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 129f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
       "mov v26.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "mov v27.16b, v24.16b\n"
       "mov v28.16b, v24.16b\n"
       "mov v29.16b, v24.16b\n"
@@ -1470,53 +1469,53 @@
       "b 134f\n"
       "129:"  // Height 7: no bias
       "tbz %x[flags], #0, 133f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
-      "add x26, x27, x8, LSL #2\n"
-      "add x25, x26, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
-      "add x23, x24, x8, LSL #2\n"
-      "add x22, x23, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "add x9, x10, x26, LSL #2\n"
+      "cmp x4, #0x4\n"
+      "add x28, x9, x26, LSL #2\n"
       "bge 132f\n"
-      "tbz x17, #1, 130f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "ldr d26, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "ldr d28, [x24], #0x8\n"
-      "ldr d29, [x23], #0x8\n"
-      "ldr d30, [x22], #0x8\n"
-      "tbz x17, #0, 131f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
-      "ld1 { v26.s }[2], [x26]\n"
-      "ld1 { v27.s }[2], [x25]\n"
-      "ld1 { v28.s }[2], [x24]\n"
-      "ld1 { v29.s }[2], [x23]\n"
-      "ld1 { v30.s }[2], [x22]\n"
+      "tbz x4, #1, 130f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x12], #0x8\n"
+      "ldr d27, [x11], #0x8\n"
+      "ldr d28, [x10], #0x8\n"
+      "ldr d29, [x9], #0x8\n"
+      "ldr d30, [x28], #0x8\n"
+      "tbz x4, #0, 131f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x12]\n"
+      "ld1 { v27.s }[2], [x11]\n"
+      "ld1 { v28.s }[2], [x10]\n"
+      "ld1 { v29.s }[2], [x9]\n"
+      "ld1 { v30.s }[2], [x28]\n"
       "b 131f\n"
       "130:"  // Height 7: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
-      "ldr s26, [x26, #0x0]\n"
-      "ldr s27, [x25, #0x0]\n"
-      "ldr s28, [x24, #0x0]\n"
-      "ldr s29, [x23, #0x0]\n"
-      "ldr s30, [x22, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x12, #0x0]\n"
+      "ldr s27, [x11, #0x0]\n"
+      "ldr s28, [x10, #0x0]\n"
+      "ldr s29, [x9, #0x0]\n"
+      "ldr s30, [x28, #0x0]\n"
       "131:"  // Height 7: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 134f\n"
       "132:"  // Height 7: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
-      "ldr q26, [x26, #0x0]\n"
-      "ldr q27, [x25, #0x0]\n"
-      "ldr q28, [x24, #0x0]\n"
-      "ldr q29, [x23, #0x0]\n"
-      "ldr q30, [x22, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q27, [x11, #0x0]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q29, [x9, #0x0]\n"
+      "ldr q30, [x28, #0x0]\n"
       "b 134f\n"
       "133:"  // Height 7: no accumulate
       "movi v24.16b, #0x0\n"
@@ -1527,171 +1526,171 @@
       "movi v29.16b, #0x0\n"
       "movi v30.16b, #0x0\n"
       "134:"  // Height 7: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "135:"  // Height 7: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 136f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "ldr x27, [x20, #0x10]\n"
-      "ldr x25, [x20, #0x18]\n"
-      "ldr x24, [x20, #0x20]\n"
-      "ldr x23, [x20, #0x28]\n"
-      "ldr x22, [x20, #0x30]\n"
-      "cbnz x13, 137f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
-      "add x27, x27, x8, LSL #2\n"
-      "add x25, x25, x8, LSL #2\n"
-      "add x24, x24, x8, LSL #2\n"
-      "add x23, x23, x8, LSL #2\n"
-      "add x22, x22, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "ldr x15, [x26, #0x10]\n"
+      "ldr x14, [x26, #0x18]\n"
+      "ldr x13, [x26, #0x20]\n"
+      "ldr x12, [x26, #0x28]\n"
+      "ldr x11, [x26, #0x30]\n"
+      "cbnz x7, 137f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
+      "add x15, x15, x26, LSL #2\n"
+      "add x14, x14, x26, LSL #2\n"
+      "add x13, x13, x26, LSL #2\n"
+      "add x12, x12, x26, LSL #2\n"
+      "add x11, x11, x26, LSL #2\n"
       "b 137f\n"
       "136:"  // Height 7: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
-      "add x27, x9, x8, LSL #2\n"
-      "add x25, x27, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
-      "add x23, x24, x8, LSL #2\n"
-      "add x22, x23, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
+      "add x15, x16, x27, LSL #2\n"
+      "add x14, x15, x27, LSL #2\n"
+      "add x13, x14, x27, LSL #2\n"
+      "add x12, x13, x27, LSL #2\n"
+      "add x11, x12, x27, LSL #2\n"
       "137:"  // Height 7: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 140f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x24, #0x0]\n"
-      "ldr q5, [x23, #0x0]\n"
-      "ldr q6, [x22, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q2, [x15, #0x0]\n"
+      "ldr q3, [x14, #0x0]\n"
+      "ldr q4, [x13, #0x0]\n"
+      "ldr q5, [x12, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 139f\n"
       "138:"  // Height 7: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr d10, [x16, #0x20]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "ldr x21, [x16, #0x28]\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "mov v9.d[1], x8\n"
+      "add x13, x13, #0x10\n"
       "fmla v29.4s, v8.4s, v5.s[0]\n"
-      "ldr d11, [x16, #0x30]\n"
+      "add x12, x12, #0x10\n"
       "fmla v30.4s, v8.4s, v6.s[0]\n"
-      "mov v10.d[1], x21\n"
-      "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "ldr x8, [x16, #0x38]\n"
-      "fmla v25.4s, v9.4s, v1.s[1]\n"
       "add x11, x11, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "ldr x26, [x5, #0x8]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "mov v11.d[1], x8\n"
+      "ldr x10, [x5, #0x18]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "ldr x10, [x11, #0x8]\n"
+      "ldr x9, [x5, #0x28]\n"
       "fmla v29.4s, v9.4s, v5.s[1]\n"
-      "add x9, x9, #0x10\n"
+      "ldr x28, [x17, #0x8]\n"
       "fmla v30.4s, v9.4s, v6.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "ldr d9, [x5, #0x10]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x27, [x16, #0x8]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "add x27, x27, #0x10\n"
+      "sub x8, x8, #0x4\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "cmp x8, #0x8\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "add x25, x25, #0x10\n"
-      "fmla v29.4s, v10.4s, v5.s[2]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v30.4s, v10.4s, v6.s[2]\n"
-      "ldr x8, [x25, #0x8]\n"
-      "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
-      "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "fmla v26.4s, v11.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "fmla v27.4s, v11.4s, v3.s[3]\n"
-      "mov v0.d[1], x10\n"
-      "fmla v28.4s, v11.4s, v4.s[3]\n"
-      "mov v1.d[1], x28\n"
-      "fmla v29.4s, v11.4s, v5.s[3]\n"
-      "mov v2.d[1], x26\n"
-      "fmla v30.4s, v11.4s, v6.s[3]\n"
-      "ldr d3, [x25, #0x0]\n"
-      "add x24, x24, #0x10\n"
-      "add x23, x23, #0x10\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
-      "add x22, x22, #0x10\n"
-      "mov v3.d[1], x8\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
-      "sub x12, x12, #0x4\n"
-      "ldr d4, [x24, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr x21, [x24, #0x8]\n"
-      "add x16, x16, #0x40\n"
-      "ldr d8, [x16, #0x0]\n"
-      "ldr x26, [x16, #0x8]\n"
-      "mov v4.d[1], x21\n"
-      "ldr d5, [x23, #0x0]\n"
-      "ldr x8, [x23, #0x8]\n"
       "mov v8.d[1], x26\n"
-      "ldr d6, [x22, #0x0]\n"
-      "ldr x21, [x22, #0x8]\n"
-      "mov v5.d[1], x8\n"
-      "mov v6.d[1], x21\n"
+      "fmla v28.4s, v10.4s, v4.s[2]\n"
+      "ldr x26, [x15, #0x8]\n"
+      "fmla v29.4s, v10.4s, v5.s[2]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
+      "fmla v30.4s, v10.4s, v6.s[2]\n"
+      "ldr d10, [x5, #0x20]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "ldr d0, [x17, #0x0]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "ldr d1, [x16, #0x0]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "ldr d2, [x15, #0x0]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "ldr d3, [x14, #0x0]\n"
+      "fmla v28.4s, v11.4s, v4.s[3]\n"
+      "ldr d4, [x13, #0x0]\n"
+      "fmla v29.4s, v11.4s, v5.s[3]\n"
+      "ldr d5, [x12, #0x0]\n"
+      "fmla v30.4s, v11.4s, v6.s[3]\n"
+      "ldr d6, [x11, #0x0]\n"
+      "ldr d11, [x5, #0x30]\n"
+      "mov v9.d[1], x10\n"
+      "ldr x10, [x14, #0x8]\n"
+      "mov v10.d[1], x9\n"
+      "ldr x9, [x13, #0x8]\n"
+      "mov v0.d[1], x28\n"
+      "ldr x28, [x12, #0x8]\n"
+      "mov v1.d[1], x27\n"
+      "ldr x27, [x11, #0x8]\n"
+      "mov v2.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v3.d[1], x10\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
+      "mov v4.d[1], x9\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
+      "mov v5.d[1], x28\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "mov v6.d[1], x27\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
+      "mov v11.d[1], x26\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "bge 138b\n"
       "139:"  // Height 7: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr q11, [x16, #0x30]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "sub x12, x12, #0x4\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "add x11, x11, #0x10\n"
+      "add x13, x13, #0x10\n"
       "fmla v29.4s, v8.4s, v5.s[0]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "add x12, x12, #0x10\n"
       "fmla v30.4s, v8.4s, v6.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v24.4s, v9.4s, v0.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "add x27, x27, #0x10\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "add x25, x25, #0x10\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
       "fmla v29.4s, v9.4s, v5.s[1]\n"
-      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v30.4s, v9.4s, v6.s[1]\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x23, x23, #0x10\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "add x5, x5, #0x40\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "add x22, x22, #0x10\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
       "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "add x16, x16, #0x40\n"
       "fmla v29.4s, v10.4s, v5.s[2]\n"
       "fmla v30.4s, v10.4s, v6.s[2]\n"
       "fmla v24.4s, v11.4s, v0.s[3]\n"
@@ -1702,50 +1701,48 @@
       "fmla v29.4s, v11.4s, v5.s[3]\n"
       "fmla v30.4s, v11.4s, v6.s[3]\n"
       "140:"  // Height 7: Multiply loop: Main loop skip
-      "cbz x12, 142f\n"
+      "cbz x8, 142f\n"
       "141:"  // Height 7: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr s5, [x23], #0x4\n"
-      "ldr s6, [x22], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "fmla v29.4s, v12.4s, v5.s[0]\n"
-      "fmla v30.4s, v12.4s, v6.s[0]\n"
-      "cbnz x12, 141b\n"
+      "ldr s23, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s22, [x16], #0x4\n"
+      "ldr s21, [x15], #0x4\n"
+      "ldr s20, [x14], #0x4\n"
+      "ldr s19, [x13], #0x4\n"
+      "ldr s18, [x12], #0x4\n"
+      "ldr s17, [x11], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v23.s[0]\n"
+      "fmla v25.4s, v16.4s, v22.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "fmla v26.4s, v16.4s, v21.s[0]\n"
+      "fmla v27.4s, v16.4s, v20.s[0]\n"
+      "fmla v28.4s, v16.4s, v19.s[0]\n"
+      "fmla v29.4s, v16.4s, v18.s[0]\n"
+      "fmla v30.4s, v16.4s, v17.s[0]\n"
+      "cbnz x8, 141b\n"
       "142:"  // Height 7: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 135b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
-      "prfm pstl1keep, [x27, #0x0]\n"
-      "add x26, x27, x8, LSL #2\n"
-      "prfm pstl1keep, [x26, #0x0]\n"
-      "add x25, x26, x8, LSL #2\n"
-      "prfm pstl1keep, [x25, #0x0]\n"
-      "add x24, x25, x8, LSL #2\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
-      "add x23, x24, x8, LSL #2\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
-      "add x22, x23, x8, LSL #2\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "add x9, x10, x26, LSL #2\n"
+      "add x28, x9, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x12, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x10, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x28, #0x0]\n"
       "tbz %x[flags], #1, 143f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
       "fmin v26.4s, v26.4s, v16.4s\n"
@@ -1753,70 +1750,72 @@
       "fmin v28.4s, v28.4s, v16.4s\n"
       "fmin v29.4s, v29.4s, v16.4s\n"
       "fmin v30.4s, v30.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
-      "fmax v26.4s, v26.4s, v17.4s\n"
-      "fmax v27.4s, v27.4s, v17.4s\n"
-      "fmax v28.4s, v28.4s, v17.4s\n"
-      "fmax v29.4s, v29.4s, v17.4s\n"
-      "fmax v30.4s, v30.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
+      "fmax v26.4s, v26.4s, v16.4s\n"
+      "fmax v27.4s, v27.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v16.4s\n"
+      "fmax v29.4s, v29.4s, v16.4s\n"
+      "fmax v30.4s, v30.4s, v16.4s\n"
       "143:"  // Height 7: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 146f\n"
-      "tbz x17, #1, 144f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "str d26, [x26], #0x8\n"
-      "str d27, [x25], #0x8\n"
-      "str d28, [x24], #0x8\n"
-      "str d29, [x23], #0x8\n"
-      "str d30, [x22], #0x8\n"
-      "tbz x17, #0, 145f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
-      "st1 { v26.s }[2], [x26]\n"
-      "st1 { v27.s }[2], [x25]\n"
-      "st1 { v28.s }[2], [x24]\n"
-      "st1 { v29.s }[2], [x23]\n"
-      "st1 { v30.s }[2], [x22]\n"
+      "tbz x4, #1, 144f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x12], #0x8\n"
+      "str d27, [x11], #0x8\n"
+      "str d28, [x10], #0x8\n"
+      "str d29, [x9], #0x8\n"
+      "str d30, [x28], #0x8\n"
+      "tbz x4, #0, 145f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x12]\n"
+      "st1 { v27.s }[2], [x11]\n"
+      "st1 { v28.s }[2], [x10]\n"
+      "st1 { v29.s }[2], [x9]\n"
+      "st1 { v30.s }[2], [x28]\n"
       "b 145f\n"
       "144:"  // Height 7: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
-      "str s26, [x26, #0x0]\n"
-      "str s27, [x25, #0x0]\n"
-      "str s28, [x24, #0x0]\n"
-      "str s29, [x23, #0x0]\n"
-      "str s30, [x22, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x12, #0x0]\n"
+      "str s27, [x11, #0x0]\n"
+      "str s28, [x10, #0x0]\n"
+      "str s29, [x9, #0x0]\n"
+      "str s30, [x28, #0x0]\n"
       "145:"  // Height 7: Partial direct writeback: Done
       "b 147f\n"
       "146:"  // Height 7: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
-      "str q26, [x26, #0x0]\n"
-      "str q27, [x25, #0x0]\n"
-      "str q28, [x24, #0x0]\n"
-      "str q29, [x23, #0x0]\n"
-      "str q30, [x22, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x12, #0x0]\n"
+      "str q27, [x11, #0x0]\n"
+      "str q28, [x10, #0x0]\n"
+      "str q29, [x9, #0x0]\n"
+      "str q30, [x28, #0x0]\n"
       "147:"  // Height 7: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 128b\n"
       "b 170f\n"
       "148:"  // Height 8
-      "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x15, %x[bias]\n"
-      "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "mov x8, #0x20\n"
-      "madd %x[output_ptr], x20, x8, %x[output_ptr]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov x26, #0x20\n"
+      "mov x3, %x[bias]\n"
+      "ldr x4, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x6, %x[output_ptr]\n"
+      "madd %x[output_ptr], x27, x26, %x[output_ptr]\n"
       "149:"  // Height 8: Column loop
-      "cbz x15, 150f\n"
-      "ldr q24, [x15, #0x0]\n"
-      "add x15, x15, #0x10\n"
+      "cbz x3, 150f\n"
+      "ldr q24, [x3, #0x0]\n"
       "mov v25.16b, v24.16b\n"
       "mov v26.16b, v24.16b\n"
+      "add x3, x3, #0x10\n"
       "mov v27.16b, v24.16b\n"
       "mov v28.16b, v24.16b\n"
       "mov v29.16b, v24.16b\n"
@@ -1825,58 +1824,58 @@
       "b 155f\n"
       "150:"  // Height 8: no bias
       "tbz %x[flags], #0, 154f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "cmp x17, #0x4\n"
-      "add x27, x14, x8, LSL #2\n"
-      "add x26, x27, x8, LSL #2\n"
-      "add x25, x26, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
-      "add x23, x24, x8, LSL #2\n"
-      "add x22, x23, x8, LSL #2\n"
-      "add x21, x22, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "add x9, x10, x26, LSL #2\n"
+      "add x28, x9, x26, LSL #2\n"
+      "cmp x4, #0x4\n"
+      "add x27, x28, x26, LSL #2\n"
       "bge 153f\n"
-      "tbz x17, #1, 151f\n"
-      "ldr d24, [x14], #0x8\n"
-      "ldr d25, [x27], #0x8\n"
-      "mov x8, #0x8\n"
-      "ldr d26, [x26], #0x8\n"
-      "ldr d27, [x25], #0x8\n"
-      "ldr d28, [x24], #0x8\n"
-      "ldr d29, [x23], #0x8\n"
-      "ldr d30, [x22], #0x8\n"
-      "ldr d31, [x21], #0x8\n"
-      "tbz x17, #0, 152f\n"
-      "ld1 { v24.s }[2], [x14]\n"
-      "ld1 { v25.s }[2], [x27]\n"
-      "ld1 { v26.s }[2], [x26]\n"
-      "ld1 { v27.s }[2], [x25]\n"
-      "ld1 { v28.s }[2], [x24]\n"
-      "ld1 { v29.s }[2], [x23]\n"
-      "ld1 { v30.s }[2], [x22]\n"
-      "ld1 { v31.s }[2], [x21]\n"
+      "tbz x4, #1, 151f\n"
+      "ldr d24, [x6], #0x8\n"
+      "mov x26, #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x12], #0x8\n"
+      "ldr d27, [x11], #0x8\n"
+      "ldr d28, [x10], #0x8\n"
+      "ldr d29, [x9], #0x8\n"
+      "ldr d30, [x28], #0x8\n"
+      "ldr d31, [x27], #0x8\n"
+      "tbz x4, #0, 152f\n"
+      "ld1 { v24.s }[2], [x6]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x12]\n"
+      "ld1 { v27.s }[2], [x11]\n"
+      "ld1 { v28.s }[2], [x10]\n"
+      "ld1 { v29.s }[2], [x9]\n"
+      "ld1 { v30.s }[2], [x28]\n"
+      "ld1 { v31.s }[2], [x27]\n"
       "b 152f\n"
       "151:"  // Height 8: Partial accumulate: partial_1_0
-      "ldr s24, [x14, #0x0]\n"
-      "mov x8, #0x0\n"
-      "ldr s25, [x27, #0x0]\n"
-      "ldr s26, [x26, #0x0]\n"
-      "ldr s27, [x25, #0x0]\n"
-      "ldr s28, [x24, #0x0]\n"
-      "ldr s29, [x23, #0x0]\n"
-      "ldr s30, [x22, #0x0]\n"
-      "ldr s31, [x21, #0x0]\n"
+      "ldr s24, [x6, #0x0]\n"
+      "mov x26, #0x0\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x12, #0x0]\n"
+      "ldr s27, [x11, #0x0]\n"
+      "ldr s28, [x10, #0x0]\n"
+      "ldr s29, [x9, #0x0]\n"
+      "ldr s30, [x28, #0x0]\n"
+      "ldr s31, [x27, #0x0]\n"
       "152:"  // Height 8: Partial accumulate: Done
-      "sub x14, x14, x8\n"
+      "sub x6, x6, x26\n"
       "b 155f\n"
       "153:"  // Height 8: full accumulate
-      "ldr q24, [x14, #0x0]\n"
-      "ldr q25, [x27, #0x0]\n"
-      "ldr q26, [x26, #0x0]\n"
-      "ldr q27, [x25, #0x0]\n"
-      "ldr q28, [x24, #0x0]\n"
-      "ldr q29, [x23, #0x0]\n"
-      "ldr q30, [x22, #0x0]\n"
-      "ldr q31, [x21, #0x0]\n"
+      "ldr q24, [x6, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x12, #0x0]\n"
+      "ldr q27, [x11, #0x0]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q29, [x9, #0x0]\n"
+      "ldr q30, [x28, #0x0]\n"
+      "ldr q31, [x27, #0x0]\n"
       "b 155f\n"
       "154:"  // Height 8: no accumulate
       "movi v24.16b, #0x0\n"
@@ -1888,188 +1887,188 @@
       "movi v30.16b, #0x0\n"
       "movi v31.16b, #0x0\n"
       "155:"  // Height 8: setup done
-      "mov x13, #0x0\n"
+      "mov x7, #0x0\n"
       "156:"  // Height 8: String loop
-      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n"
-      "ldr w12, [x20, x13, LSL #0x2]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w8, [x26, x7, LSL #0x2]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 157f\n"
-      "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
-      "add x20, x20, x8, LSL #3\n"
-      "ldr x11, [x20, #0x0]\n"
-      "ldr x9, [x20, #0x8]\n"
-      "ldr x27, [x20, #0x10]\n"
-      "ldr x25, [x20, #0x18]\n"
-      "ldr x24, [x20, #0x20]\n"
-      "ldr x23, [x20, #0x28]\n"
-      "ldr x22, [x20, #0x30]\n"
-      "ldr x20, [x20, #0x38]\n"
-      "cbnz x13, 158f\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x11, x11, x8, LSL #2\n"
-      "add x9, x9, x8, LSL #2\n"
-      "add x27, x27, x8, LSL #2\n"
-      "add x25, x25, x8, LSL #2\n"
-      "add x24, x24, x8, LSL #2\n"
-      "add x23, x23, x8, LSL #2\n"
-      "add x22, x22, x8, LSL #2\n"
-      "add x20, x20, x8, LSL #2\n"
+      "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n"
+      "add x26, x26, x27, LSL #3\n"
+      "ldr x17, [x26, #0x0]\n"
+      "ldr x16, [x26, #0x8]\n"
+      "ldr x15, [x26, #0x10]\n"
+      "ldr x14, [x26, #0x18]\n"
+      "ldr x13, [x26, #0x20]\n"
+      "ldr x12, [x26, #0x28]\n"
+      "ldr x11, [x26, #0x30]\n"
+      "ldr x27, [x26, #0x38]\n"
+      "cbnz x7, 158f\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x17, x17, x26, LSL #2\n"
+      "add x16, x16, x26, LSL #2\n"
+      "add x15, x15, x26, LSL #2\n"
+      "add x14, x14, x26, LSL #2\n"
+      "add x13, x13, x26, LSL #2\n"
+      "add x12, x12, x26, LSL #2\n"
+      "add x11, x11, x26, LSL #2\n"
+      "add x27, x27, x26, LSL #2\n"
       "b 158f\n"
       "157:"  // Height 8: setup direct input
-      "mov x11, %x[input_ptr]\n"
-      "add x9, x11, x8, LSL #2\n"
-      "add x27, x9, x8, LSL #2\n"
-      "add x25, x27, x8, LSL #2\n"
-      "add x24, x25, x8, LSL #2\n"
-      "add x23, x24, x8, LSL #2\n"
-      "add x22, x23, x8, LSL #2\n"
-      "add x20, x22, x8, LSL #2\n"
+      "mov x17, %x[input_ptr]\n"
+      "add x16, x17, x27, LSL #2\n"
+      "add x15, x16, x27, LSL #2\n"
+      "add x14, x15, x27, LSL #2\n"
+      "add x13, x14, x27, LSL #2\n"
+      "add x12, x13, x27, LSL #2\n"
+      "add x11, x12, x27, LSL #2\n"
+      "add x27, x11, x27, LSL #2\n"
       "158:"  // Height 8: input setup done
-      "cmp x12, #0x4\n"
+      "cmp x8, #0x4\n"
       "blt 161f\n"
-      "ldr q0, [x11, #0x0]\n"
-      "ldr q1, [x9, #0x0]\n"
-      "cmp x12, #0x8\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x24, #0x0]\n"
-      "ldr q5, [x23, #0x0]\n"
-      "ldr q6, [x22, #0x0]\n"
-      "ldr q7, [x20, #0x0]\n"
-      "ldr q8, [x16, #0x0]\n"
+      "ldr q0, [x17, #0x0]\n"
+      "cmp x8, #0x8\n"
+      "ldr q1, [x16, #0x0]\n"
+      "ldr q2, [x15, #0x0]\n"
+      "ldr q3, [x14, #0x0]\n"
+      "ldr q4, [x13, #0x0]\n"
+      "ldr q5, [x12, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      "ldr q7, [x27, #0x0]\n"
+      "ldr q8, [x5, #0x0]\n"
+      "ldr q9, [x5, #0x10]\n"
+      "ldr q10, [x5, #0x20]\n"
+      "ldr q11, [x5, #0x30]\n"
       "blt 160f\n"
       "159:"  // Height 8: Multiply loop: Main loop head
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr d9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr x8, [x16, #0x18]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr d10, [x16, #0x20]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "ldr x21, [x16, #0x28]\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "mov v9.d[1], x8\n"
+      "add x13, x13, #0x10\n"
       "fmla v29.4s, v8.4s, v5.s[0]\n"
-      "ldr d11, [x16, #0x30]\n"
+      "add x12, x12, #0x10\n"
       "fmla v30.4s, v8.4s, v6.s[0]\n"
-      "mov v10.d[1], x21\n"
-      "fmla v31.4s, v8.4s, v7.s[0]\n"
-      "ldr x8, [x16, #0x38]\n"
-      "fmla v24.4s, v9.4s, v0.s[1]\n"
       "add x11, x11, #0x10\n"
-      "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
-      "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "mov v11.d[1], x8\n"
-      "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "ldr x10, [x11, #0x8]\n"
-      "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "add x9, x9, #0x10\n"
-      "fmla v29.4s, v9.4s, v5.s[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v30.4s, v9.4s, v6.s[1]\n"
-      "ldr x28, [x9, #0x8]\n"
-      "fmla v31.4s, v9.4s, v7.s[1]\n"
+      "fmla v31.4s, v8.4s, v7.s[0]\n"
       "add x27, x27, #0x10\n"
-      "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "ldr x26, [x27, #0x8]\n"
-      "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "add x25, x25, #0x10\n"
-      "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "ldr x8, [x25, #0x8]\n"
-      "fmla v29.4s, v10.4s, v5.s[2]\n"
-      "add x24, x24, #0x10\n"
-      "fmla v30.4s, v10.4s, v6.s[2]\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
-      "fmla v31.4s, v10.4s, v7.s[2]\n"
-      "ldr x21, [x24, #0x8]\n"
-      "fmla v24.4s, v11.4s, v0.s[3]\n"
-      "ldr d0, [x11, #0x0]\n"
-      "fmla v25.4s, v11.4s, v1.s[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "fmla v26.4s, v11.4s, v2.s[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      "fmla v27.4s, v11.4s, v3.s[3]\n"
-      "mov v0.d[1], x10\n"
-      "fmla v28.4s, v11.4s, v4.s[3]\n"
-      "mov v1.d[1], x28\n"
-      "fmla v29.4s, v11.4s, v5.s[3]\n"
-      "mov v2.d[1], x26\n"
-      "fmla v30.4s, v11.4s, v6.s[3]\n"
-      "ldr d3, [x25, #0x0]\n"
-      "fmla v31.4s, v11.4s, v7.s[3]\n"
-      "ldr d4, [x24, #0x0]\n"
-      "add x23, x23, #0x10\n"
-      "add x22, x22, #0x10\n"
-      "mov v3.d[1], x8\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "mov v4.d[1], x21\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
-      "ldr d5, [x23, #0x0]\n"
-      "add x20, x20, #0x10\n"
-      "prfm pldl1keep, [x20, #0x80]\n"
-      "sub x12, x12, #0x4\n"
-      "ldr x8, [x23, #0x8]\n"
-      "cmp x12, #0x8\n"
-      "ldr d6, [x22, #0x0]\n"
-      "add x16, x16, #0x40\n"
-      "ldr d8, [x16, #0x0]\n"
-      "mov v5.d[1], x8\n"
-      "ldr x26, [x16, #0x8]\n"
-      "ldr x21, [x22, #0x8]\n"
-      "ldr d7, [x20, #0x0]\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "add x5, x5, #0x40\n"
+      "ldr d8, [x5, #0x0]\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "ldr x26, [x5, #0x8]\n"
+      "fmla v27.4s, v9.4s, v3.s[1]\n"
+      "sub x8, x8, #0x4\n"
+      "fmla v28.4s, v9.4s, v4.s[1]\n"
+      "cmp x8, #0x8\n"
+      "fmla v29.4s, v9.4s, v5.s[1]\n"
       "mov v8.d[1], x26\n"
-      "ldr x8, [x20, #0x8]\n"
-      "mov v6.d[1], x21\n"
-      "mov v7.d[1], x8\n"
+      "fmla v30.4s, v9.4s, v6.s[1]\n"
+      "ldr x26, [x5, #0x18]\n"
+      "fmla v31.4s, v9.4s, v7.s[1]\n"
+      "ldr d9, [x5, #0x10]\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
+      "fmla v27.4s, v10.4s, v3.s[2]\n"
+      "mov v9.d[1], x26\n"
+      "fmla v28.4s, v10.4s, v4.s[2]\n"
+      "ldr x26, [x5, #0x28]\n"
+      "fmla v29.4s, v10.4s, v5.s[2]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v30.4s, v10.4s, v6.s[2]\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
+      "fmla v31.4s, v10.4s, v7.s[2]\n"
+      "ldr d10, [x5, #0x20]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "ldr d0, [x17, #0x0]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "ldr d1, [x16, #0x0]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "ldr d2, [x15, #0x0]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "ldr d3, [x14, #0x0]\n"
+      "fmla v28.4s, v11.4s, v4.s[3]\n"
+      "ldr d4, [x13, #0x0]\n"
+      "fmla v29.4s, v11.4s, v5.s[3]\n"
+      "ldr d5, [x12, #0x0]\n"
+      "fmla v30.4s, v11.4s, v6.s[3]\n"
+      "ldr d6, [x11, #0x0]\n"
+      "fmla v31.4s, v11.4s, v7.s[3]\n"
+      "ldr d7, [x27, #0x0]\n"
+      "ldr d11, [x5, #0x30]\n"
+      "mov v10.d[1], x26\n"
+      "ldr x26, [x17, #0x8]\n"
+      "mov v0.d[1], x26\n"
+      "ldr x26, [x16, #0x8]\n"
+      "mov v1.d[1], x26\n"
+      "ldr x26, [x15, #0x8]\n"
+      "mov v2.d[1], x26\n"
+      "ldr x26, [x14, #0x8]\n"
+      "mov v3.d[1], x26\n"
+      "ldr x26, [x13, #0x8]\n"
+      "mov v4.d[1], x26\n"
+      "ldr x26, [x12, #0x8]\n"
+      "mov v5.d[1], x26\n"
+      "ldr x26, [x11, #0x8]\n"
+      "mov v6.d[1], x26\n"
+      "ldr x26, [x27, #0x8]\n"
+      "mov v7.d[1], x26\n"
+      "ldr x26, [x5, #0x38]\n"
+      "mov v11.d[1], x26\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       "bge 159b\n"
       "160:"  // Height 8: Multiply loop: Single iteration only
       "fmla v24.4s, v8.4s, v0.s[0]\n"
-      "ldr q9, [x16, #0x10]\n"
+      "add x17, x17, #0x10\n"
       "fmla v25.4s, v8.4s, v1.s[0]\n"
-      "ldr q10, [x16, #0x20]\n"
+      "add x16, x16, #0x10\n"
       "fmla v26.4s, v8.4s, v2.s[0]\n"
-      "ldr q11, [x16, #0x30]\n"
+      "add x15, x15, #0x10\n"
       "fmla v27.4s, v8.4s, v3.s[0]\n"
-      "sub x12, x12, #0x4\n"
+      "add x14, x14, #0x10\n"
       "fmla v28.4s, v8.4s, v4.s[0]\n"
-      "add x11, x11, #0x10\n"
+      "add x13, x13, #0x10\n"
       "fmla v29.4s, v8.4s, v5.s[0]\n"
-      "prfm pldl1keep, [x11, #0x80]\n"
+      "add x12, x12, #0x10\n"
       "fmla v30.4s, v8.4s, v6.s[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       "fmla v31.4s, v8.4s, v7.s[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "fmla v24.4s, v9.4s, v0.s[1]\n"
       "add x27, x27, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "sub x8, x8, #0x4\n"
       "fmla v25.4s, v9.4s, v1.s[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x17, #0x80]\n"
       "fmla v26.4s, v9.4s, v2.s[1]\n"
-      "add x25, x25, #0x10\n"
+      "prfm pldl1keep, [x16, #0x80]\n"
       "fmla v27.4s, v9.4s, v3.s[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x15, #0x80]\n"
       "fmla v28.4s, v9.4s, v4.s[1]\n"
-      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
       "fmla v29.4s, v9.4s, v5.s[1]\n"
-      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x13, #0x80]\n"
       "fmla v30.4s, v9.4s, v6.s[1]\n"
-      "add x23, x23, #0x10\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "fmla v31.4s, v9.4s, v7.s[1]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "fmla v24.4s, v10.4s, v0.s[2]\n"
-      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       "fmla v25.4s, v10.4s, v1.s[2]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
+      "add x5, x5, #0x40\n"
       "fmla v26.4s, v10.4s, v2.s[2]\n"
-      "add x20, x20, #0x10\n"
       "fmla v27.4s, v10.4s, v3.s[2]\n"
-      "prfm pldl1keep, [x20, #0x80]\n"
       "fmla v28.4s, v10.4s, v4.s[2]\n"
-      "add x16, x16, #0x40\n"
       "fmla v29.4s, v10.4s, v5.s[2]\n"
       "fmla v30.4s, v10.4s, v6.s[2]\n"
       "fmla v31.4s, v10.4s, v7.s[2]\n"
@@ -2082,54 +2081,52 @@
       "fmla v30.4s, v11.4s, v6.s[3]\n"
       "fmla v31.4s, v11.4s, v7.s[3]\n"
       "161:"  // Height 8: Multiply loop: Main loop skip
-      "cbz x12, 163f\n"
+      "cbz x8, 163f\n"
       "162:"  // Height 8: Multiply loop: Odd block loop
-      "ldr s0, [x11], #0x4\n"
-      "sub x12, x12, #0x1\n"
-      "ldr s1, [x9], #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr s5, [x23], #0x4\n"
-      "ldr s6, [x22], #0x4\n"
-      "ldr s7, [x20], #0x4\n"
-      "ldr q12, [x16, #0x0]\n"
-      "add x16, x16, #0x10\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "fmla v29.4s, v12.4s, v5.s[0]\n"
-      "fmla v30.4s, v12.4s, v6.s[0]\n"
-      "fmla v31.4s, v12.4s, v7.s[0]\n"
-      "cbnz x12, 162b\n"
+      "ldr s0, [x17], #0x4\n"
+      "sub x8, x8, #0x1\n"
+      "ldr s23, [x16], #0x4\n"
+      "ldr s22, [x15], #0x4\n"
+      "ldr s21, [x14], #0x4\n"
+      "ldr s20, [x13], #0x4\n"
+      "ldr s19, [x12], #0x4\n"
+      "ldr s18, [x11], #0x4\n"
+      "ldr s17, [x27], #0x4\n"
+      "ldr q16, [x5, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "fmla v25.4s, v16.4s, v23.s[0]\n"
+      "add x5, x5, #0x10\n"
+      "fmla v26.4s, v16.4s, v22.s[0]\n"
+      "fmla v27.4s, v16.4s, v21.s[0]\n"
+      "fmla v28.4s, v16.4s, v20.s[0]\n"
+      "fmla v29.4s, v16.4s, v19.s[0]\n"
+      "fmla v30.4s, v16.4s, v18.s[0]\n"
+      "fmla v31.4s, v16.4s, v17.s[0]\n"
+      "cbnz x8, 162b\n"
       "163:"  // Height 8: Multiply loop: No odd multiplies
-      "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x13, x13, #0x1\n"
-      "cmp x13, x8\n"
+      "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x7, x7, #0x1\n"
+      "cmp x7, x26\n"
       "bne 156b\n"
-      "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "add x27, x14, x8, LSL #2\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x13, x6, x26, LSL #2\n"
+      "add x12, x13, x26, LSL #2\n"
+      "add x11, x12, x26, LSL #2\n"
+      "add x10, x11, x26, LSL #2\n"
+      "add x9, x10, x26, LSL #2\n"
+      "add x28, x9, x26, LSL #2\n"
+      "add x27, x28, x26, LSL #2\n"
+      "prfm pstl1keep, [x6, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x12, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x10, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x28, #0x0]\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "add x26, x27, x8, LSL #2\n"
-      "prfm pstl1keep, [x26, #0x0]\n"
-      "add x25, x26, x8, LSL #2\n"
-      "prfm pstl1keep, [x25, #0x0]\n"
-      "add x24, x25, x8, LSL #2\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
-      "add x23, x24, x8, LSL #2\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
-      "add x22, x23, x8, LSL #2\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
-      "add x21, x22, x8, LSL #2\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "tbz %x[flags], #1, 164f\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
-      "add x8, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v17.4s }, [x20]\n"
-      "ld1r { v16.4s }, [x8]\n"
+      "add x26, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x26]\n"
       "fmin v24.4s, v24.4s, v16.4s\n"
       "fmin v25.4s, v25.4s, v16.4s\n"
       "fmin v26.4s, v26.4s, v16.4s\n"
@@ -2138,76 +2135,77 @@
       "fmin v29.4s, v29.4s, v16.4s\n"
       "fmin v30.4s, v30.4s, v16.4s\n"
       "fmin v31.4s, v31.4s, v16.4s\n"
-      "fmax v24.4s, v24.4s, v17.4s\n"
-      "fmax v25.4s, v25.4s, v17.4s\n"
-      "fmax v26.4s, v26.4s, v17.4s\n"
-      "fmax v27.4s, v27.4s, v17.4s\n"
-      "fmax v28.4s, v28.4s, v17.4s\n"
-      "fmax v29.4s, v29.4s, v17.4s\n"
-      "fmax v30.4s, v30.4s, v17.4s\n"
-      "fmax v31.4s, v31.4s, v17.4s\n"
+      "add x26, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v16.4s }, [x26]\n"
+      "fmax v24.4s, v24.4s, v16.4s\n"
+      "fmax v25.4s, v25.4s, v16.4s\n"
+      "fmax v26.4s, v26.4s, v16.4s\n"
+      "fmax v27.4s, v27.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v16.4s\n"
+      "fmax v29.4s, v29.4s, v16.4s\n"
+      "fmax v30.4s, v30.4s, v16.4s\n"
+      "fmax v31.4s, v31.4s, v16.4s\n"
       "164:"  // Height 8: No activation
-      "cmp x17, #0x4\n"
+      "cmp x4, #0x4\n"
       "bge 167f\n"
-      "tbz x17, #1, 165f\n"
-      "str d24, [x14], #0x8\n"
-      "str d25, [x27], #0x8\n"
-      "str d26, [x26], #0x8\n"
-      "str d27, [x25], #0x8\n"
-      "str d28, [x24], #0x8\n"
-      "str d29, [x23], #0x8\n"
-      "str d30, [x22], #0x8\n"
-      "str d31, [x21], #0x8\n"
-      "tbz x17, #0, 166f\n"
-      "st1 { v24.s }[2], [x14]\n"
-      "st1 { v25.s }[2], [x27]\n"
-      "st1 { v26.s }[2], [x26]\n"
-      "st1 { v27.s }[2], [x25]\n"
-      "st1 { v28.s }[2], [x24]\n"
-      "st1 { v29.s }[2], [x23]\n"
-      "st1 { v30.s }[2], [x22]\n"
-      "st1 { v31.s }[2], [x21]\n"
+      "tbz x4, #1, 165f\n"
+      "str d24, [x6], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x12], #0x8\n"
+      "str d27, [x11], #0x8\n"
+      "str d28, [x10], #0x8\n"
+      "str d29, [x9], #0x8\n"
+      "str d30, [x28], #0x8\n"
+      "str d31, [x27], #0x8\n"
+      "tbz x4, #0, 166f\n"
+      "st1 { v24.s }[2], [x6]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x12]\n"
+      "st1 { v27.s }[2], [x11]\n"
+      "st1 { v28.s }[2], [x10]\n"
+      "st1 { v29.s }[2], [x9]\n"
+      "st1 { v30.s }[2], [x28]\n"
+      "st1 { v31.s }[2], [x27]\n"
       "b 166f\n"
       "165:"  // Height 8: Partial direct writeback: partial_1_0
-      "str s24, [x14, #0x0]\n"
-      "str s25, [x27, #0x0]\n"
-      "str s26, [x26, #0x0]\n"
-      "str s27, [x25, #0x0]\n"
-      "str s28, [x24, #0x0]\n"
-      "str s29, [x23, #0x0]\n"
-      "str s30, [x22, #0x0]\n"
-      "str s31, [x21, #0x0]\n"
+      "str s24, [x6, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x12, #0x0]\n"
+      "str s27, [x11, #0x0]\n"
+      "str s28, [x10, #0x0]\n"
+      "str s29, [x9, #0x0]\n"
+      "str s30, [x28, #0x0]\n"
+      "str s31, [x27, #0x0]\n"
       "166:"  // Height 8: Partial direct writeback: Done
       "b 168f\n"
       "167:"  // Height 8: Full writeback
-      "str q24, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q25, [x27, #0x0]\n"
-      "str q26, [x26, #0x0]\n"
-      "str q27, [x25, #0x0]\n"
-      "str q28, [x24, #0x0]\n"
-      "str q29, [x23, #0x0]\n"
-      "str q30, [x22, #0x0]\n"
-      "str q31, [x21, #0x0]\n"
+      "str q24, [x6, #0x0]\n"
+      "add x6, x6, #0x10\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x12, #0x0]\n"
+      "str q27, [x11, #0x0]\n"
+      "str q28, [x10, #0x0]\n"
+      "str q29, [x9, #0x0]\n"
+      "str q30, [x28, #0x0]\n"
+      "str q31, [x27, #0x0]\n"
       "168:"  // Height 8: Writeback done
-      "subs x17, x17, #0x4\n"
+      "subs x4, x4, #0x4\n"
       "bgt 149b\n"
       "subs %x[M], %x[M], #0x8\n"
       "beq 170f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 169f\n"
-      "add x20, x20, #0x8\n"
-      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "add x27, x27, #0x8\n"
+      "str x27, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "b 1b\n"
       "169:"  // Update direct input
-      "mov x8, #0x20\n"
-      "madd %x[input_ptr], x8, x20, %x[input_ptr]\n"
+      "mov x26, #0x20\n"
+      "madd %x[input_ptr], x26, x27, %x[input_ptr]\n"
       "b 1b\n"
       "170:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x8", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x26", "x27", "x28"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
index bd22336..004e5d7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -92,7 +92,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x8\n"
       "bge 148f\n"
@@ -140,11 +139,11 @@
       "9:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 10f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
       "cbnz x10, 11f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -189,10 +188,10 @@
       "14:"  // Height 1: Multiply loop: Main loop skip
       "cbz x9, 16f\n"
       "15:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
+      "ldr s17, [x28], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
       "sub x9, x9, #0x1\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "fmla v24.4s, v16.4s, v17.s[0]\n"
       "add x12, x12, #0x10\n"
       "cbnz x9, 15b\n"
       "16:"  // Height 1: Multiply loop: No odd multiplies
@@ -271,12 +270,12 @@
       "30:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
       "cbnz x10, 32f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -284,7 +283,7 @@
       "b 32f\n"
       "31:"  // Height 2: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
       "32:"  // Height 2: input setup done
       "cmp x9, #0x4\n"
       "blt 35f\n"
@@ -337,12 +336,12 @@
       "35:"  // Height 2: Multiply loop: Main loop skip
       "cbz x9, 37f\n"
       "36:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s18, [x28], #0x4\n"
+      "ldr s17, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v18.s[0]\n"
+      "fmla v25.4s, v16.4s, v17.s[0]\n"
       "add x12, x12, #0x10\n"
       "cbnz x9, 36b\n"
       "37:"  // Height 2: Multiply loop: No odd multiplies
@@ -437,13 +436,13 @@
       "51:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 52f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
       "cbnz x10, 53f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -452,8 +451,8 @@
       "b 53f\n"
       "52:"  // Height 3: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
       "53:"  // Height 3: input setup done
       "cmp x9, #0x4\n"
       "blt 56f\n"
@@ -520,14 +519,14 @@
       "56:"  // Height 3: Multiply loop: Main loop skip
       "cbz x9, 58f\n"
       "57:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s19, [x28], #0x4\n"
+      "ldr s18, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr s2, [x26], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr s17, [x26], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v19.s[0]\n"
+      "fmla v25.4s, v16.4s, v18.s[0]\n"
+      "fmla v26.4s, v16.4s, v17.s[0]\n"
       "add x12, x12, #0x10\n"
       "cbnz x9, 57b\n"
       "58:"  // Height 3: Multiply loop: No odd multiplies
@@ -637,14 +636,14 @@
       "72:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 73f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
       "cbnz x10, 74f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -654,9 +653,9 @@
       "b 74f\n"
       "73:"  // Height 4: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "74:"  // Height 4: input setup done
       "cmp x9, #0x4\n"
       "blt 77f\n"
@@ -737,17 +736,17 @@
       "77:"  // Height 4: Multiply loop: Main loop skip
       "cbz x9, 79f\n"
       "78:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s20, [x28], #0x4\n"
+      "ldr s19, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr s2, [x26], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr s17, [x25], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v20.s[0]\n"
+      "fmla v25.4s, v16.4s, v19.s[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "fmla v26.4s, v16.4s, v18.s[0]\n"
+      "fmla v27.4s, v16.4s, v17.s[0]\n"
       "cbnz x9, 78b\n"
       "79:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -871,15 +870,15 @@
       "93:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 94f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
       "cbnz x10, 95f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -890,10 +889,10 @@
       "b 95f\n"
       "94:"  // Height 5: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "95:"  // Height 5: input setup done
       "cmp x9, #0x4\n"
       "blt 98f\n"
@@ -988,19 +987,19 @@
       "98:"  // Height 5: Multiply loop: Main loop skip
       "cbz x9, 100f\n"
       "99:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s21, [x28], #0x4\n"
+      "ldr s20, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr s2, [x26], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
+      "ldr s17, [x24], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v21.s[0]\n"
+      "fmla v25.4s, v16.4s, v20.s[0]\n"
+      "fmla v26.4s, v16.4s, v19.s[0]\n"
+      "fmla v27.4s, v16.4s, v18.s[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
+      "fmla v28.4s, v16.4s, v17.s[0]\n"
       "cbnz x9, 99b\n"
       "100:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1139,16 +1138,16 @@
       "114:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 115f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
       "cbnz x10, 116f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -1160,11 +1159,11 @@
       "b 116f\n"
       "115:"  // Height 6: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "116:"  // Height 6: input setup done
       "cmp x9, #0x4\n"
       "blt 119f\n"
@@ -1273,21 +1272,21 @@
       "119:"  // Height 6: Multiply loop: Main loop skip
       "cbz x9, 121f\n"
       "120:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s22, [x28], #0x4\n"
+      "ldr s21, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr s2, [x26], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr s5, [x23], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr s20, [x26], #0x4\n"
+      "ldr s19, [x25], #0x4\n"
+      "ldr s18, [x24], #0x4\n"
+      "ldr s17, [x23], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v22.s[0]\n"
+      "fmla v25.4s, v16.4s, v21.s[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "fmla v29.4s, v12.4s, v5.s[0]\n"
+      "fmla v26.4s, v16.4s, v20.s[0]\n"
+      "fmla v27.4s, v16.4s, v19.s[0]\n"
+      "fmla v28.4s, v16.4s, v18.s[0]\n"
+      "fmla v29.4s, v16.4s, v17.s[0]\n"
       "cbnz x9, 120b\n"
       "121:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1441,17 +1440,17 @@
       "135:"  // Height 7: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 136f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
-      "ldr x22, [x21, #0x30]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
       "cbnz x10, 137f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -1464,12 +1463,12 @@
       "b 137f\n"
       "136:"  // Height 7: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "137:"  // Height 7: input setup done
       "cmp x9, #0x4\n"
       "blt 140f\n"
@@ -1592,23 +1591,23 @@
       "140:"  // Height 7: Multiply loop: Main loop skip
       "cbz x9, 142f\n"
       "141:"  // Height 7: Multiply loop: Odd block loop
-      "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s23, [x28], #0x4\n"
+      "ldr s22, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr s2, [x26], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr s5, [x23], #0x4\n"
-      "ldr s6, [x22], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "ldr s21, [x26], #0x4\n"
+      "ldr s20, [x25], #0x4\n"
+      "ldr s19, [x24], #0x4\n"
+      "ldr s18, [x23], #0x4\n"
+      "ldr s17, [x22], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v23.s[0]\n"
+      "fmla v25.4s, v16.4s, v22.s[0]\n"
+      "fmla v26.4s, v16.4s, v21.s[0]\n"
+      "fmla v27.4s, v16.4s, v20.s[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "fmla v29.4s, v12.4s, v5.s[0]\n"
-      "fmla v30.4s, v12.4s, v6.s[0]\n"
+      "fmla v28.4s, v16.4s, v19.s[0]\n"
+      "fmla v29.4s, v16.4s, v18.s[0]\n"
+      "fmla v30.4s, v16.4s, v17.s[0]\n"
       "cbnz x9, 141b\n"
       "142:"  // Height 7: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -1780,18 +1779,18 @@
       "156:"  // Height 8: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 157f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
-      "ldr x22, [x21, #0x30]\n"
-      "ldr x21, [x21, #0x38]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "ldr x21, [x20, #0x38]\n"
       "cbnz x10, 158f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -1805,13 +1804,13 @@
       "b 158f\n"
       "157:"  // Height 8: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "158:"  // Height 8: input setup done
       "cmp x9, #0x4\n"
       "blt 161f\n"
@@ -1949,24 +1948,24 @@
       "cbz x9, 163f\n"
       "162:"  // Height 8: Multiply loop: Odd block loop
       "ldr s0, [x28], #0x4\n"
-      "ldr s1, [x27], #0x4\n"
+      "ldr s23, [x27], #0x4\n"
       "sub x9, x9, #0x1\n"
-      "ldr s2, [x26], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x24], #0x4\n"
-      "ldr s5, [x23], #0x4\n"
-      "ldr s6, [x22], #0x4\n"
-      "ldr s7, [x21], #0x4\n"
-      "ldr q12, [x12, #0x0]\n"
-      "fmla v24.4s, v12.4s, v0.s[0]\n"
-      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr s22, [x26], #0x4\n"
+      "ldr s21, [x25], #0x4\n"
+      "ldr s20, [x24], #0x4\n"
+      "ldr s19, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s17, [x21], #0x4\n"
+      "ldr q16, [x12, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "fmla v25.4s, v16.4s, v23.s[0]\n"
       "add x12, x12, #0x10\n"
-      "fmla v26.4s, v12.4s, v2.s[0]\n"
-      "fmla v27.4s, v12.4s, v3.s[0]\n"
-      "fmla v28.4s, v12.4s, v4.s[0]\n"
-      "fmla v29.4s, v12.4s, v5.s[0]\n"
-      "fmla v30.4s, v12.4s, v6.s[0]\n"
-      "fmla v31.4s, v12.4s, v7.s[0]\n"
+      "fmla v26.4s, v16.4s, v22.s[0]\n"
+      "fmla v27.4s, v16.4s, v21.s[0]\n"
+      "fmla v28.4s, v16.4s, v20.s[0]\n"
+      "fmla v29.4s, v16.4s, v19.s[0]\n"
+      "fmla v30.4s, v16.4s, v18.s[0]\n"
+      "fmla v31.4s, v16.4s, v17.s[0]\n"
       "cbnz x9, 162b\n"
       "163:"  // Height 8: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -2068,10 +2067,9 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "170:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index e6e7950..f31dd7a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
index a0ea968..0e468b1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -93,7 +93,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 130f\n"
@@ -255,11 +254,11 @@
       "20:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 21f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 22f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -279,31 +278,31 @@
       "23:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q24, [x28, #0x40]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q23, [x28, #0x50]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q22, [x28, #0x60]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      "ldr q21, [x28, #0x70]\n"
+      ".inst 0x6e58ec0a  // bfmmla v10.4s, v0.8h, v24.8h\n"
+      "ldr q24, [x28, #0x80]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x90]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      "ldr q22, [x28, #0xa0]\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0xb0]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x8\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e58ec0c  // bfmmla v12.4s, v0.8h, v24.8h\n"
       "add x28, x28, #0xc0\n"
       "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
       "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
       "ldr q6, [x28, #0x20]\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "ldr q7, [x28, #0x30]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "ld1 { v0.4s }, [x24], #0x10\n"
@@ -311,28 +310,28 @@
       "24:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q23, [x28, #0x40]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q25, [x28, #0x50]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q21, [x28, #0x60]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x6e57ec0a  // bfmmla v10.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x80]\n"
+      ".inst 0x6e59ec10  // bfmmla v16.4s, v0.8h, v25.8h\n"
+      "ldr q22, [x28, #0x90]\n"
+      ".inst 0x6e55ec0b  // bfmmla v11.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0xa0]\n"
+      ".inst 0x6e58ec11  // bfmmla v17.4s, v0.8h, v24.8h\n"
+      "ldr q5, [x28, #0xb0]\n"
       "sub x25, x25, #0x4\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e57ec0c  // bfmmla v12.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e56ec12  // bfmmla v18.4s, v0.8h, v22.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "add x28, x28, #0xc0\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec0d  // bfmmla v13.4s, v0.8h, v21.8h\n"
+      ".inst 0x6e45ec13  // bfmmla v19.4s, v0.8h, v5.8h\n"
       "25:"  // Height 1: Multiply loop: Main loop skip
       "cbz x25, 28f\n"
       "cbz x25, 28f\n"
@@ -344,31 +343,31 @@
       "26:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr s0, [x24, #0x0]\n"
       "27:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
+      "ldr q21, [x28, #0x0]\n"
+      "ldr q1, [x28, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q6, [x28, #0x20]\n"
-      "ldr q7, [x28, #0x30]\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q4, [x28, #0x40]\n"
-      "ldr q5, [x28, #0x50]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q6, [x28, #0x60]\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec08  // bfmmla v8.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0x20]\n"
+      "ldr q22, [x28, #0x30]\n"
+      ".inst 0x6e41ec0e  // bfmmla v14.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e55ec09  // bfmmla v9.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0x40]\n"
+      "ldr q23, [x28, #0x50]\n"
+      ".inst 0x6e56ec0f  // bfmmla v15.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec0a  // bfmmla v10.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0x60]\n"
+      "ldr q22, [x28, #0x70]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e55ec0b  // bfmmla v11.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0x80]\n"
+      "ldr q23, [x28, #0x90]\n"
+      ".inst 0x6e56ec11  // bfmmla v17.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec0c  // bfmmla v12.4s, v0.8h, v21.8h\n"
+      "ldr q22, [x28, #0xa0]\n"
+      "ldr q21, [x28, #0xb0]\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "add x28, x28, #0xc0\n"
       "28:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -384,21 +383,21 @@
       "uzp1 v13.2d, v13.2d, v19.2d\n"
       "tbz %x[flags], #1, 29f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v21.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v22.4s\n"
+      "fmin v9.4s, v9.4s, v22.4s\n"
+      "fmin v10.4s, v10.4s, v22.4s\n"
+      "fmin v11.4s, v11.4s, v22.4s\n"
+      "fmin v12.4s, v12.4s, v22.4s\n"
+      "fmin v13.4s, v13.4s, v22.4s\n"
+      "fmax v8.4s, v8.4s, v21.4s\n"
+      "fmax v9.4s, v9.4s, v21.4s\n"
+      "fmax v10.4s, v10.4s, v21.4s\n"
+      "fmax v11.4s, v11.4s, v21.4s\n"
+      "fmax v12.4s, v12.4s, v21.4s\n"
+      "fmax v13.4s, v13.4s, v21.4s\n"
       "29:"  // Height 1: No activation
       "cmp x9, #0x18\n"
       "bge 42f\n"
@@ -678,12 +677,12 @@
       "63:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 64f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 65f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -691,7 +690,7 @@
       "b 65f\n"
       "64:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "65:"  // Height 2: input setup done
       "cmp x25, #0x4\n"
       "blt 68f\n"
@@ -707,31 +706,31 @@
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q3, [x28, #0x40]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q23, [x28, #0x50]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q22, [x28, #0x60]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      "ldr q21, [x28, #0x70]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
+      "ldr q1, [x28, #0x80]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x90]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      "ldr q22, [x28, #0xa0]\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0xb0]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x8\n"
       "add x28, x28, #0xc0\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e41ec0c  // bfmmla v12.4s, v0.8h, v1.8h\n"
       "ldr q4, [x28, #0x0]\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
       "ldr q5, [x28, #0x10]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
       "ldr q6, [x28, #0x20]\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "ldr q7, [x28, #0x30]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "ld1 { v0.4s }, [x24], #0x10\n"
@@ -742,28 +741,28 @@
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q24, [x28, #0x40]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q23, [x28, #0x50]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0x60]\n"
+      "ldr q22, [x28, #0x60]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      "ldr q21, [x28, #0x70]\n"
+      ".inst 0x6e58ec0a  // bfmmla v10.4s, v0.8h, v24.8h\n"
+      "ldr q24, [x28, #0x80]\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q23, [x28, #0x90]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      "ldr q22, [x28, #0xa0]\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q21, [x28, #0xb0]\n"
       "sub x25, x25, #0x4\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e58ec0c  // bfmmla v12.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "add x28, x28, #0xc0\n"
       "68:"  // Height 2: Multiply loop: Main loop skip
       "cbz x25, 71f\n"
@@ -779,32 +778,32 @@
       "ldr s0, [x24, #0x0]\n"
       "ldr s1, [x23, #0x0]\n"
       "70:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q23, [x28, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
-      "ldr q6, [x28, #0x20]\n"
-      "ldr q7, [x28, #0x30]\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      "ldr q4, [x28, #0x40]\n"
-      "ldr q5, [x28, #0x50]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x28, #0x60]\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      "ldr q7, [x28, #0xb0]\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      "ldr q22, [x28, #0x20]\n"
+      "ldr q21, [x28, #0x30]\n"
+      ".inst 0x6e58ec08  // bfmmla v8.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec0e  // bfmmla v14.4s, v0.8h, v23.8h\n"
+      "ldr q24, [x28, #0x40]\n"
+      "ldr q23, [x28, #0x50]\n"
+      ".inst 0x6e56ec09  // bfmmla v9.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec0f  // bfmmla v15.4s, v0.8h, v21.8h\n"
+      "ldr q22, [x28, #0x60]\n"
+      "ldr q21, [x28, #0x70]\n"
+      ".inst 0x6e58ec0a  // bfmmla v10.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec10  // bfmmla v16.4s, v0.8h, v23.8h\n"
+      "ldr q24, [x28, #0x80]\n"
+      "ldr q23, [x28, #0x90]\n"
+      ".inst 0x6e56ec0b  // bfmmla v11.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec11  // bfmmla v17.4s, v0.8h, v21.8h\n"
+      "ldr q22, [x28, #0xa0]\n"
+      "ldr q21, [x28, #0xb0]\n"
+      ".inst 0x6e58ec0c  // bfmmla v12.4s, v0.8h, v24.8h\n"
+      ".inst 0x6e57ec12  // bfmmla v18.4s, v0.8h, v23.8h\n"
+      ".inst 0x6e56ec0d  // bfmmla v13.4s, v0.8h, v22.8h\n"
+      ".inst 0x6e55ec13  // bfmmla v19.4s, v0.8h, v21.8h\n"
       "add x28, x28, #0xc0\n"
       "71:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -829,33 +828,33 @@
       "uzp2 v13.2d, v13.2d, v19.2d\n"
       "tbz %x[flags], #1, 72f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v4.4s, v4.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmax v4.4s, v4.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
+      "ld1r { v21.4s }, [x20]\n"
+      "fmin v4.4s, v4.4s, v22.4s\n"
+      "fmin v14.4s, v14.4s, v22.4s\n"
+      "fmin v15.4s, v15.4s, v22.4s\n"
+      "fmin v16.4s, v16.4s, v22.4s\n"
+      "fmin v17.4s, v17.4s, v22.4s\n"
+      "fmin v18.4s, v18.4s, v22.4s\n"
+      "fmin v8.4s, v8.4s, v22.4s\n"
+      "fmin v9.4s, v9.4s, v22.4s\n"
+      "fmin v10.4s, v10.4s, v22.4s\n"
+      "fmin v11.4s, v11.4s, v22.4s\n"
+      "fmin v12.4s, v12.4s, v22.4s\n"
+      "fmin v13.4s, v13.4s, v22.4s\n"
+      "fmax v4.4s, v4.4s, v21.4s\n"
+      "fmax v14.4s, v14.4s, v21.4s\n"
+      "fmax v15.4s, v15.4s, v21.4s\n"
+      "fmax v16.4s, v16.4s, v21.4s\n"
+      "fmax v17.4s, v17.4s, v21.4s\n"
+      "fmax v18.4s, v18.4s, v21.4s\n"
+      "fmax v8.4s, v8.4s, v21.4s\n"
+      "fmax v9.4s, v9.4s, v21.4s\n"
+      "fmax v10.4s, v10.4s, v21.4s\n"
+      "fmax v11.4s, v11.4s, v21.4s\n"
+      "fmax v12.4s, v12.4s, v21.4s\n"
+      "fmax v13.4s, v13.4s, v21.4s\n"
       "72:"  // Height 2: No activation
       "cmp x9, #0x18\n"
       "bge 85f\n"
@@ -1238,13 +1237,13 @@
       "106:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 107f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 108f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -1253,8 +1252,8 @@
       "b 108f\n"
       "107:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "108:"  // Height 3: input setup done
       "cmp x25, #0x4\n"
       "blt 111f\n"
@@ -1285,7 +1284,7 @@
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
+      "ldr q3, [x28, #0x70]\n"
       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "ld1 { v1.4s }, [x23], #0x10\n"
@@ -1298,9 +1297,9 @@
       "prfm pldl1keep, [x22, #0x80]\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
       "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      ".inst 0x6e43ec11  // bfmmla v17.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec5d  // bfmmla v29.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x28, #0xb0]\n"
       "add x28, x28, #0xc0\n"
       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
@@ -1311,9 +1310,9 @@
       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
       "ldr q6, [x28, #0x20]\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec13  // bfmmla v19.4s, v0.8h, v3.8h\n"
       "ld1 { v0.4s }, [x24], #0x10\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec5f  // bfmmla v31.4s, v2.8h, v3.8h\n"
       "ld1 { v2.4s }, [x22], #0x10\n"
       "ldr q7, [x28, #0x30]\n"
       "bge 109b\n"
@@ -1324,10 +1323,10 @@
       "sub x25, x25, #0x4\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q3, [x28, #0x40]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q4, [x28, #0x50]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
@@ -1335,29 +1334,29 @@
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
+      ".inst 0x6e43ec56  // bfmmla v22.4s, v2.8h, v3.8h\n"
+      "ldr q5, [x28, #0x80]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x90]\n"
       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      "ldr q3, [x28, #0xa0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x28, #0xb0]\n"
       "add x28, x28, #0xc0\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "111:"  // Height 3: Multiply loop: Main loop skip
       "cbz x25, 114f\n"
       "cbz x25, 114f\n"
@@ -1375,46 +1374,46 @@
       "ldr s1, [x23, #0x0]\n"
       "ldr s2, [x22, #0x0]\n"
       "113:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
+      "ldr q5, [x28, #0x0]\n"
+      "ldr q4, [x28, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
-      "ldr q6, [x28, #0x20]\n"
-      "ldr q7, [x28, #0x30]\n"
+      "ldr q3, [x28, #0x20]\n"
+      "ldr q1, [x28, #0x30]\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x28, #0x60]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e45ec08  // bfmmla v8.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec54  // bfmmla v20.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x28, #0x40]\n"
+      ".inst 0x6e44ec0e  // bfmmla v14.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5a  // bfmmla v26.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec55  // bfmmla v21.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x28, #0x60]\n"
+      ".inst 0x6e41ec0f  // bfmmla v15.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5b  // bfmmla v27.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x6e45ec0a  // bfmmla v10.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec56  // bfmmla v22.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x28, #0x80]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x90]\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x28, #0xa0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x28, #0xb0]\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
       "add x28, x28, #0xc0\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "114:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -1937,14 +1936,14 @@
       "149:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 150f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 151f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -1954,9 +1953,9 @@
       "b 151f\n"
       "150:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "151:"  // Height 4: input setup done
       "cmp x25, #0x4\n"
       "blt 154f\n"
@@ -2033,39 +2032,39 @@
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
+      "ldr q3, [x28, #0x40]\n"
       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
       "prfm pldl1keep, [x22, #0x80]\n"
       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
+      "ldr q4, [x28, #0x50]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
       "prfm pldl1keep, [x21, #0x80]\n"
       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
       "ldr q6, [x28, #0x60]\n"
       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec56  // bfmmla v22.4s, v2.8h, v3.8h\n"
+      "ldr q5, [x28, #0x80]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x90]\n"
       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      "ldr q3, [x28, #0xa0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x28, #0xb0]\n"
       "add x28, x28, #0xc0\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "154:"  // Height 4: Multiply loop: Main loop skip
       "cbz x25, 157f\n"
       "cbz x25, 157f\n"
@@ -2086,47 +2085,47 @@
       "ldr s2, [x22, #0x0]\n"
       "ldr s3, [x21, #0x0]\n"
       "156:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q4, [x28, #0x0]\n"
-      "ldr q5, [x28, #0x10]\n"
+      "ldr q5, [x28, #0x0]\n"
+      "ldr q4, [x28, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
-      "ldr q6, [x28, #0x20]\n"
-      "ldr q7, [x28, #0x30]\n"
+      "ldr q7, [x28, #0x20]\n"
+      "ldr q6, [x28, #0x30]\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
-      ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x40]\n"
-      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x50]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x28, #0x60]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0x70]\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      "ldr q4, [x28, #0x80]\n"
-      ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
-      "ldr q5, [x28, #0x90]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x28, #0xa0]\n"
-      ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x28, #0xb0]\n"
+      ".inst 0x6e45ec08  // bfmmla v8.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec54  // bfmmla v20.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x28, #0x40]\n"
+      ".inst 0x6e44ec0e  // bfmmla v14.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5a  // bfmmla v26.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
+      "ldr q3, [x28, #0x60]\n"
+      ".inst 0x6e46ec0f  // bfmmla v15.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e46ec5b  // bfmmla v27.4s, v2.8h, v6.8h\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x6e45ec0a  // bfmmla v10.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec56  // bfmmla v22.4s, v2.8h, v5.8h\n"
+      "ldr q5, [x28, #0x80]\n"
+      ".inst 0x6e44ec10  // bfmmla v16.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5c  // bfmmla v28.4s, v2.8h, v4.8h\n"
+      "ldr q4, [x28, #0x90]\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      "ldr q3, [x28, #0xa0]\n"
+      ".inst 0x6e41ec11  // bfmmla v17.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5d  // bfmmla v29.4s, v2.8h, v1.8h\n"
+      "ldr q1, [x28, #0xb0]\n"
       "add x28, x28, #0xc0\n"
-      ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec12  // bfmmla v18.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e44ec5e  // bfmmla v30.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e41ec13  // bfmmla v19.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec5f  // bfmmla v31.4s, v2.8h, v1.8h\n"
       "157:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -2415,7 +2414,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "174:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index 39ffcbe..71e16d6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -99,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
index 4993777..5693c3f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -93,7 +93,6 @@
             break;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 176f\n"
@@ -211,11 +210,11 @@
       "16:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -233,23 +232,23 @@
       "19:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q18, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec09  // bfmmla v9.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ec0d  // bfmmla v13.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ec0a  // bfmmla v10.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ec0e  // bfmmla v14.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e52ec0b  // bfmmla v11.4s, v0.8h, v18.8h\n"
       "add x10, x10, #0x80\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e51ec0f  // bfmmla v15.4s, v0.8h, v17.8h\n"
       "ldr q7, [x10, #0x10]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "ld1 { v0.4s }, [x26], #0x10\n"
@@ -257,20 +256,20 @@
       "20:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q18, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec09  // bfmmla v9.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ec0d  // bfmmla v13.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ec0a  // bfmmla v10.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ec0e  // bfmmla v14.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x4\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ec0b  // bfmmla v11.4s, v0.8h, v18.8h\n"
+      ".inst 0x6e51ec0f  // bfmmla v15.4s, v0.8h, v17.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x80\n"
       "21:"  // Height 1: Multiply loop: Main loop skip
@@ -284,23 +283,23 @@
       "22:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr s0, [x26, #0x0]\n"
       "23:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ec08  // bfmmla v8.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ec0c  // bfmmla v12.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec09  // bfmmla v9.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ec0d  // bfmmla v13.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ec0a  // bfmmla v10.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ec0e  // bfmmla v14.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x6e52ec0b  // bfmmla v11.4s, v0.8h, v18.8h\n"
+      ".inst 0x6e51ec0f  // bfmmla v15.4s, v0.8h, v17.8h\n"
       "add x10, x10, #0x80\n"
       "24:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -314,17 +313,17 @@
       "uzp1 v11.2d, v11.2d, v15.2d\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "fmin v8.4s, v8.4s, v18.4s\n"
+      "fmin v9.4s, v9.4s, v18.4s\n"
+      "fmin v10.4s, v10.4s, v18.4s\n"
+      "fmin v11.4s, v11.4s, v18.4s\n"
+      "fmax v8.4s, v8.4s, v17.4s\n"
+      "fmax v9.4s, v9.4s, v17.4s\n"
+      "fmax v10.4s, v10.4s, v17.4s\n"
+      "fmax v11.4s, v11.4s, v17.4s\n"
       "25:"  // Height 1: No activation
       "cmp x11, #0x10\n"
       "bge 34f\n"
@@ -515,12 +514,12 @@
       "51:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 52f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 53f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -528,7 +527,7 @@
       "b 53f\n"
       "52:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "53:"  // Height 2: input setup done
       "cmp x27, #0x4\n"
       "blt 56f\n"
@@ -542,23 +541,23 @@
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q18, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec09  // bfmmla v9.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ec0d  // bfmmla v13.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ec0a  // bfmmla v10.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ec0e  // bfmmla v14.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x8\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e52ec0b  // bfmmla v11.4s, v0.8h, v18.8h\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e51ec0f  // bfmmla v15.4s, v0.8h, v17.8h\n"
       "ldr q7, [x10, #0x10]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "ld1 { v0.4s }, [x26], #0x10\n"
@@ -569,20 +568,20 @@
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q18, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec09  // bfmmla v9.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ec0d  // bfmmla v13.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ec0a  // bfmmla v10.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ec0e  // bfmmla v14.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x4\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ec0b  // bfmmla v11.4s, v0.8h, v18.8h\n"
+      ".inst 0x6e51ec0f  // bfmmla v15.4s, v0.8h, v17.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "add x10, x10, #0x80\n"
@@ -600,24 +599,24 @@
       "ldr s0, [x26, #0x0]\n"
       "ldr s1, [x25, #0x0]\n"
       "58:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e52ec08  // bfmmla v8.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e51ec0c  // bfmmla v12.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e52ec09  // bfmmla v9.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e51ec0d  // bfmmla v13.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e52ec0a  // bfmmla v10.4s, v0.8h, v18.8h\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e51ec0e  // bfmmla v14.4s, v0.8h, v17.8h\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x6e52ec0b  // bfmmla v11.4s, v0.8h, v18.8h\n"
+      ".inst 0x6e51ec0f  // bfmmla v15.4s, v0.8h, v17.8h\n"
       "add x10, x10, #0x80\n"
       "59:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -638,25 +637,25 @@
       "uzp2 v11.2d, v11.2d, v15.2d\n"
       "tbz %x[flags], #1, 60f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v6.4s, v6.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmax v6.4s, v6.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "fmin v6.4s, v6.4s, v18.4s\n"
+      "fmin v12.4s, v12.4s, v18.4s\n"
+      "fmin v13.4s, v13.4s, v18.4s\n"
+      "fmin v14.4s, v14.4s, v18.4s\n"
+      "fmin v8.4s, v8.4s, v18.4s\n"
+      "fmin v9.4s, v9.4s, v18.4s\n"
+      "fmin v10.4s, v10.4s, v18.4s\n"
+      "fmin v11.4s, v11.4s, v18.4s\n"
+      "fmax v6.4s, v6.4s, v17.4s\n"
+      "fmax v12.4s, v12.4s, v17.4s\n"
+      "fmax v13.4s, v13.4s, v17.4s\n"
+      "fmax v14.4s, v14.4s, v17.4s\n"
+      "fmax v8.4s, v8.4s, v17.4s\n"
+      "fmax v9.4s, v9.4s, v17.4s\n"
+      "fmax v10.4s, v10.4s, v17.4s\n"
+      "fmax v11.4s, v11.4s, v17.4s\n"
       "60:"  // Height 2: No activation
       "cmp x11, #0x10\n"
       "bge 69f\n"
@@ -912,13 +911,13 @@
       "86:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 87f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 88f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -927,8 +926,8 @@
       "b 88f\n"
       "87:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "88:"  // Height 3: input setup done
       "cmp x27, #0x4\n"
       "blt 91f\n"
@@ -946,34 +945,34 @@
       "sub x27, x27, #0x4\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q26, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec09  // bfmmla v9.4s, v0.8h, v26.8h\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e5aec51  // bfmmla v17.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ec0d  // bfmmla v13.4s, v0.8h, v25.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e59ec55  // bfmmla v21.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aec0a  // bfmmla v10.4s, v0.8h, v26.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "ld1 { v1.4s }, [x25], #0x10\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e5aec52  // bfmmla v18.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ec0e  // bfmmla v14.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec56  // bfmmla v22.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e5aec0b  // bfmmla v11.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec53  // bfmmla v19.4s, v2.8h, v26.8h\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ec0f  // bfmmla v15.4s, v0.8h, v25.8h\n"
       "ld1 { v0.4s }, [x26], #0x10\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e59ec57  // bfmmla v23.4s, v2.8h, v25.8h\n"
       "ld1 { v2.4s }, [x24], #0x10\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 89b\n"
@@ -984,30 +983,30 @@
       "sub x27, x27, #0x4\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q26, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec09  // bfmmla v9.4s, v0.8h, v26.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e5aec51  // bfmmla v17.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ec0d  // bfmmla v13.4s, v0.8h, v25.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e59ec55  // bfmmla v21.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aec0a  // bfmmla v10.4s, v0.8h, v26.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e5aec52  // bfmmla v18.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ec0e  // bfmmla v14.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec56  // bfmmla v22.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aec0b  // bfmmla v11.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec53  // bfmmla v19.4s, v2.8h, v26.8h\n"
+      ".inst 0x6e59ec0f  // bfmmla v15.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec57  // bfmmla v23.4s, v2.8h, v25.8h\n"
       "91:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 94f\n"
       "cbz x27, 94f\n"
@@ -1025,34 +1024,34 @@
       "ldr s1, [x25, #0x0]\n"
       "ldr s2, [x24, #0x0]\n"
       "93:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e5aec08  // bfmmla v8.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec50  // bfmmla v16.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ec0c  // bfmmla v12.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec54  // bfmmla v20.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec09  // bfmmla v9.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec51  // bfmmla v17.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ec0d  // bfmmla v13.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec55  // bfmmla v21.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aec0a  // bfmmla v10.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec52  // bfmmla v18.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ec0e  // bfmmla v14.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec56  // bfmmla v22.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aec0b  // bfmmla v11.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec53  // bfmmla v19.4s, v2.8h, v26.8h\n"
+      ".inst 0x6e59ec0f  // bfmmla v15.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec57  // bfmmla v23.4s, v2.8h, v25.8h\n"
       "94:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1078,33 +1077,33 @@
       "uzp1 v19.2d, v19.2d, v23.2d\n"
       "tbz %x[flags], #1, 95f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v6.4s, v6.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v6.4s, v6.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v25.4s }, [x20]\n"
+      "fmin v6.4s, v6.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmax v6.4s, v6.4s, v25.4s\n"
+      "fmax v12.4s, v12.4s, v25.4s\n"
+      "fmax v13.4s, v13.4s, v25.4s\n"
+      "fmax v14.4s, v14.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v25.4s\n"
+      "fmax v9.4s, v9.4s, v25.4s\n"
+      "fmax v10.4s, v10.4s, v25.4s\n"
+      "fmax v11.4s, v11.4s, v25.4s\n"
+      "fmax v16.4s, v16.4s, v25.4s\n"
+      "fmax v17.4s, v17.4s, v25.4s\n"
+      "fmax v18.4s, v18.4s, v25.4s\n"
+      "fmax v19.4s, v19.4s, v25.4s\n"
       "95:"  // Height 3: No activation
       "cmp x11, #0x10\n"
       "bge 104f\n"
@@ -1401,14 +1400,14 @@
       "121:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 122f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 123f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1418,9 +1417,9 @@
       "b 123f\n"
       "122:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "123:"  // Height 4: input setup done
       "cmp x27, #0x4\n"
       "blt 126f\n"
@@ -1442,34 +1441,34 @@
       ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q26, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "ld1 { v1.4s }, [x25], #0x10\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec09  // bfmmla v9.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec51  // bfmmla v17.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ec0d  // bfmmla v13.4s, v0.8h, v25.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e59ec55  // bfmmla v21.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aec0a  // bfmmla v10.4s, v0.8h, v26.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "ld1 { v3.4s }, [x23], #0x10\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e5aec52  // bfmmla v18.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ec0e  // bfmmla v14.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec56  // bfmmla v22.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e5aec0b  // bfmmla v11.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec53  // bfmmla v19.4s, v2.8h, v26.8h\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e59ec0f  // bfmmla v15.4s, v0.8h, v25.8h\n"
       "ld1 { v0.4s }, [x26], #0x10\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e59ec57  // bfmmla v23.4s, v2.8h, v25.8h\n"
       "ld1 { v2.4s }, [x24], #0x10\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 124b\n"
@@ -1483,29 +1482,29 @@
       ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q26, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec09  // bfmmla v9.4s, v0.8h, v26.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e5aec51  // bfmmla v17.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ec0d  // bfmmla v13.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec55  // bfmmla v21.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aec0a  // bfmmla v10.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec52  // bfmmla v18.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ec0e  // bfmmla v14.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec56  // bfmmla v22.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aec0b  // bfmmla v11.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec53  // bfmmla v19.4s, v2.8h, v26.8h\n"
+      ".inst 0x6e59ec0f  // bfmmla v15.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec57  // bfmmla v23.4s, v2.8h, v25.8h\n"
       "126:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 129f\n"
       "cbz x27, 129f\n"
@@ -1526,35 +1525,35 @@
       "ldr s2, [x24, #0x0]\n"
       "ldr s3, [x23, #0x0]\n"
       "128:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e5aec08  // bfmmla v8.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec50  // bfmmla v16.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e59ec0c  // bfmmla v12.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec54  // bfmmla v20.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e5aec09  // bfmmla v9.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec51  // bfmmla v17.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e59ec0d  // bfmmla v13.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec55  // bfmmla v21.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e5aec0a  // bfmmla v10.4s, v0.8h, v26.8h\n"
+      ".inst 0x6e5aec52  // bfmmla v18.4s, v2.8h, v26.8h\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e59ec0e  // bfmmla v14.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec56  // bfmmla v22.4s, v2.8h, v25.8h\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e5aec0b  // bfmmla v11.4s, v0.8h, v26.8h\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e5aec53  // bfmmla v19.4s, v2.8h, v26.8h\n"
+      ".inst 0x6e59ec0f  // bfmmla v15.4s, v0.8h, v25.8h\n"
+      ".inst 0x6e59ec57  // bfmmla v23.4s, v2.8h, v25.8h\n"
       "129:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1586,41 +1585,41 @@
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "tbz %x[flags], #1, 130f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1r { v1.4s }, [x20]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1r { v0.4s }, [x20]\n"
-      "fmin v6.4s, v6.4s, v1.4s\n"
-      "fmin v12.4s, v12.4s, v1.4s\n"
-      "fmin v13.4s, v13.4s, v1.4s\n"
-      "fmin v14.4s, v14.4s, v1.4s\n"
-      "fmin v8.4s, v8.4s, v1.4s\n"
-      "fmin v9.4s, v9.4s, v1.4s\n"
-      "fmin v10.4s, v10.4s, v1.4s\n"
-      "fmin v11.4s, v11.4s, v1.4s\n"
-      "fmin v15.4s, v15.4s, v1.4s\n"
-      "fmin v20.4s, v20.4s, v1.4s\n"
-      "fmin v21.4s, v21.4s, v1.4s\n"
-      "fmin v22.4s, v22.4s, v1.4s\n"
-      "fmin v16.4s, v16.4s, v1.4s\n"
-      "fmin v17.4s, v17.4s, v1.4s\n"
-      "fmin v18.4s, v18.4s, v1.4s\n"
-      "fmin v19.4s, v19.4s, v1.4s\n"
-      "fmax v6.4s, v6.4s, v0.4s\n"
-      "fmax v12.4s, v12.4s, v0.4s\n"
-      "fmax v13.4s, v13.4s, v0.4s\n"
-      "fmax v14.4s, v14.4s, v0.4s\n"
-      "fmax v8.4s, v8.4s, v0.4s\n"
-      "fmax v9.4s, v9.4s, v0.4s\n"
-      "fmax v10.4s, v10.4s, v0.4s\n"
-      "fmax v11.4s, v11.4s, v0.4s\n"
-      "fmax v15.4s, v15.4s, v0.4s\n"
-      "fmax v20.4s, v20.4s, v0.4s\n"
-      "fmax v21.4s, v21.4s, v0.4s\n"
-      "fmax v22.4s, v22.4s, v0.4s\n"
-      "fmax v16.4s, v16.4s, v0.4s\n"
-      "fmax v17.4s, v17.4s, v0.4s\n"
-      "fmax v18.4s, v18.4s, v0.4s\n"
-      "fmax v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v25.4s }, [x20]\n"
+      "fmin v6.4s, v6.4s, v26.4s\n"
+      "fmin v12.4s, v12.4s, v26.4s\n"
+      "fmin v13.4s, v13.4s, v26.4s\n"
+      "fmin v14.4s, v14.4s, v26.4s\n"
+      "fmin v8.4s, v8.4s, v26.4s\n"
+      "fmin v9.4s, v9.4s, v26.4s\n"
+      "fmin v10.4s, v10.4s, v26.4s\n"
+      "fmin v11.4s, v11.4s, v26.4s\n"
+      "fmin v15.4s, v15.4s, v26.4s\n"
+      "fmin v20.4s, v20.4s, v26.4s\n"
+      "fmin v21.4s, v21.4s, v26.4s\n"
+      "fmin v22.4s, v22.4s, v26.4s\n"
+      "fmin v16.4s, v16.4s, v26.4s\n"
+      "fmin v17.4s, v17.4s, v26.4s\n"
+      "fmin v18.4s, v18.4s, v26.4s\n"
+      "fmin v19.4s, v19.4s, v26.4s\n"
+      "fmax v6.4s, v6.4s, v25.4s\n"
+      "fmax v12.4s, v12.4s, v25.4s\n"
+      "fmax v13.4s, v13.4s, v25.4s\n"
+      "fmax v14.4s, v14.4s, v25.4s\n"
+      "fmax v8.4s, v8.4s, v25.4s\n"
+      "fmax v9.4s, v9.4s, v25.4s\n"
+      "fmax v10.4s, v10.4s, v25.4s\n"
+      "fmax v11.4s, v11.4s, v25.4s\n"
+      "fmax v15.4s, v15.4s, v25.4s\n"
+      "fmax v20.4s, v20.4s, v25.4s\n"
+      "fmax v21.4s, v21.4s, v25.4s\n"
+      "fmax v22.4s, v22.4s, v25.4s\n"
+      "fmax v16.4s, v16.4s, v25.4s\n"
+      "fmax v17.4s, v17.4s, v25.4s\n"
+      "fmax v18.4s, v18.4s, v25.4s\n"
+      "fmax v19.4s, v19.4s, v25.4s\n"
       "130:"  // Height 4: No activation
       "cmp x11, #0x10\n"
       "bge 139f\n"
@@ -1982,15 +1981,15 @@
       "156:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 157f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 158f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -2001,10 +2000,10 @@
       "b 158f\n"
       "157:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "158:"  // Height 5: input setup done
       "cmp x27, #0x4\n"
       "blt 161f\n"
@@ -2029,43 +2028,43 @@
       ".inst 0x0ea16884  // bfcvtn v4.4h, v4.4s\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q3, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "ld1 { v1.4s }, [x25], #0x10\n"
       ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr q5, [x10, #0x30]\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
+      ".inst 0x6e43ec51  // bfmmla v17.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec99  // bfmmla v25.4s, v4.8h, v3.8h\n"
       "ldr q6, [x10, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "ld1 { v3.4s }, [x23], #0x10\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec55  // bfmmla v21.4s, v2.8h, v5.8h\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x6e45ec9d  // bfmmla v29.4s, v4.8h, v5.8h\n"
+      "ldr q5, [x10, #0x50]\n"
       ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec56  // bfmmla v22.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e45ec9e  // bfmmla v30.4s, v4.8h, v5.8h\n"
+      "ldr q5, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e45ec0f  // bfmmla v15.4s, v0.8h, v5.8h\n"
       "ld1 { v0.4s }, [x26], #0x10\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
       "ld1 { v2.4s }, [x24], #0x10\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e45ec9f  // bfmmla v31.4s, v4.8h, v5.8h\n"
       "ld1 { v4.4s }, [x22], #0x10\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 159b\n"
@@ -2081,37 +2080,37 @@
       ".inst 0x0ea16884  // bfcvtn v4.4h, v4.4s\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q3, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e43ec51  // bfmmla v17.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec99  // bfmmla v25.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x40]\n"
+      ".inst 0x6e41ec0d  // bfmmla v13.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec55  // bfmmla v21.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9d  // bfmmla v29.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec52  // bfmmla v18.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9a  // bfmmla v26.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x60]\n"
+      ".inst 0x6e41ec0e  // bfmmla v14.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec56  // bfmmla v22.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9e  // bfmmla v30.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec53  // bfmmla v19.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e41ec0f  // bfmmla v15.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec57  // bfmmla v23.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9f  // bfmmla v31.4s, v4.8h, v1.8h\n"
       "161:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 164f\n"
       "cbz x27, 164f\n"
@@ -2136,7 +2135,7 @@
       "ldr s4, [x22, #0x0]\n"
       "163:"  // Height 5: Multiply loop: Ragged operand read: Done
       "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
+      "ldr q5, [x10, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
@@ -2145,34 +2144,34 @@
       ".inst 0x0ea16884  // bfcvtn v4.4h, v4.4s\n"
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
       ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
+      "ldr q3, [x10, #0x20]\n"
+      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e45ec54  // bfmmla v20.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e45ec9c  // bfmmla v28.4s, v4.8h, v5.8h\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec51  // bfmmla v17.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec99  // bfmmla v25.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x40]\n"
+      ".inst 0x6e41ec0d  // bfmmla v13.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec55  // bfmmla v21.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9d  // bfmmla v29.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec52  // bfmmla v18.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9a  // bfmmla v26.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x60]\n"
+      ".inst 0x6e41ec0e  // bfmmla v14.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec56  // bfmmla v22.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9e  // bfmmla v30.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x70]\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e43ec53  // bfmmla v19.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e41ec0f  // bfmmla v15.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec57  // bfmmla v23.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9f  // bfmmla v31.4s, v4.8h, v1.8h\n"
       "164:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2658,16 +2657,16 @@
       "191:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 192f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 193f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -2679,11 +2678,11 @@
       "b 193f\n"
       "192:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "193:"  // Height 6: input setup done
       "cmp x27, #0x4\n"
       "blt 196f\n"
@@ -2716,7 +2715,7 @@
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q5, [x10, #0x30]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
       ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
@@ -2724,10 +2723,10 @@
       "ld1 { v3.4s }, [x23], #0x10\n"
       ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
       "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e45ec55  // bfmmla v21.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e45ec9d  // bfmmla v29.4s, v4.8h, v5.8h\n"
       "ldr q7, [x10, #0x50]\n"
       "prfm pldl1keep, [x21, #0x80]\n"
       "ld1 { v5.4s }, [x21], #0x10\n"
@@ -2766,37 +2765,37 @@
       ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q3, [x10, #0x20]\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
       ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q1, [x10, #0x30]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec51  // bfmmla v17.4s, v2.8h, v3.8h\n"
       "prfm pldl1keep, [x21, #0x80]\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e43ec99  // bfmmla v25.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x40]\n"
+      ".inst 0x6e41ec0d  // bfmmla v13.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec55  // bfmmla v21.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9d  // bfmmla v29.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec52  // bfmmla v18.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9a  // bfmmla v26.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x60]\n"
+      ".inst 0x6e41ec0e  // bfmmla v14.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec56  // bfmmla v22.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9e  // bfmmla v30.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec53  // bfmmla v19.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e41ec0f  // bfmmla v15.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec57  // bfmmla v23.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9f  // bfmmla v31.4s, v4.8h, v1.8h\n"
       "196:"  // Height 6: Multiply loop: Main loop skip
       "cbz x27, 199f\n"
       "cbz x27, 199f\n"
@@ -2823,45 +2822,45 @@
       "ldr s4, [x22, #0x0]\n"
       "ldr s5, [x21, #0x0]\n"
       "198:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
+      "ldr q7, [x10, #0x0]\n"
+      "ldr q6, [x10, #0x10]\n"
       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
       ".inst 0x0ea16884  // bfcvtn v4.4h, v4.4s\n"
       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
       ".inst 0x4ea168a4  // bfcvtn2 v4.8h, v5.4s\n"
-      ".inst 0x6e46ec50  // bfmmla v16.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec98  // bfmmla v24.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9c  // bfmmla v28.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec51  // bfmmla v17.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec99  // bfmmla v25.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9d  // bfmmla v29.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec52  // bfmmla v18.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9a  // bfmmla v26.4s, v4.8h, v6.8h\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e47ec0e  // bfmmla v14.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9e  // bfmmla v30.4s, v4.8h, v7.8h\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e47ec50  // bfmmla v16.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e47ec98  // bfmmla v24.4s, v4.8h, v7.8h\n"
+      "ldr q3, [x10, #0x20]\n"
+      ".inst 0x6e46ec0c  // bfmmla v12.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e46ec9c  // bfmmla v28.4s, v4.8h, v6.8h\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec51  // bfmmla v17.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec99  // bfmmla v25.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x40]\n"
+      ".inst 0x6e41ec0d  // bfmmla v13.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec55  // bfmmla v21.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9d  // bfmmla v29.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e43ec0a  // bfmmla v10.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec52  // bfmmla v18.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9a  // bfmmla v26.4s, v4.8h, v3.8h\n"
+      "ldr q3, [x10, #0x60]\n"
+      ".inst 0x6e41ec0e  // bfmmla v14.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec56  // bfmmla v22.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9e  // bfmmla v30.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e46ec53  // bfmmla v19.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e46ec9b  // bfmmla v27.4s, v4.8h, v6.8h\n"
-      ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e47ec9f  // bfmmla v31.4s, v4.8h, v7.8h\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e43ec53  // bfmmla v19.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e41ec0f  // bfmmla v15.4s, v0.8h, v1.8h\n"
+      ".inst 0x6e41ec57  // bfmmla v23.4s, v2.8h, v1.8h\n"
+      ".inst 0x6e41ec9f  // bfmmla v31.4s, v4.8h, v1.8h\n"
       "199:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3126,7 +3125,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "212:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 905a602..bfc9c7e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -108,5 +108,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
index b31b805..eac0e71 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -78,329 +78,328 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 91f\n"
       "cmp %x[M], #0x2\n"
       "bgt 61f\n"
       "beq 31f\n"
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v15.16b, #0x1\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x14, %x[output_ptr]\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x13, %x[output_ptr]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "2:"  // Height 1: Column loop
       "movi v16.4s, #0x0\n"
       "movi v17.4s, #0x0\n"
       "movi v18.4s, #0x0\n"
       "movi v19.4s, #0x0\n"
       "3:"  // Height 1: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "cbnz x12, 6f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "cbnz x11, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
+      "add x9, x9, x20\n"
       "b 6f\n"
       "5:"  // Height 1: setup direct input
-      "mov x10, %x[input_ptr]\n"
+      "mov x9, %x[input_ptr]\n"
       "6:"  // Height 1: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 11f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 9f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr d21, [x12, #0x70]\n"
+      "ldr x20, [x12, #0x78]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d20, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d26, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
-      "mov v4.d[1], x9\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr d25, [x12, #0xa0]\n"
+      "mov v21.d[1], x20\n"
+      "ldr x20, [x12, #0x88]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
+      "ldr d24, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d23, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
-      "mov v5.d[1], x28\n"
-      "ldr x27, [x13, #0x98]\n"
-      "mov v6.d[1], x27\n"
-      "ldr x26, [x13, #0xa8]\n"
-      "mov v7.d[1], x26\n"
-      "ldr x25, [x13, #0xb8]\n"
-      "mov v8.d[1], x25\n"
-      "ldr x24, [x13, #0xc8]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "ldr x20, [x13, #0xd8]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      "ldr x9, [x13, #0xe8]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      "ldr x28, [x13, #0xf8]\n"
-      "mov v9.d[1], x24\n"
-      "mov v10.d[1], x20\n"
-      "add x10, x10, #0x10\n"
-      "mov v4.d[1], x9\n"
-      "add x13, x13, #0x100\n"
-      "mov v5.d[1], x28\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      "ldr d22, [x12, #0xd0]\n"
+      ".inst 0x4fa0e2b3  // sdot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr d21, [x12, #0xe0]\n"
+      "mov v20.d[1], x20\n"
+      "ldr x20, [x12, #0x98]\n"
+      "mov v26.d[1], x20\n"
+      "ldr x20, [x12, #0xa8]\n"
+      "mov v25.d[1], x20\n"
+      "ldr x20, [x12, #0xb8]\n"
+      "mov v24.d[1], x20\n"
+      "ldr x23, [x12, #0xc8]\n"
+      ".inst 0x4f80ea90  // sdot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr d20, [x12, #0xf0]\n"
+      ".inst 0x4f80eb51  // sdot v17.4s, v26.16b, v0.4b[2]\n"
+      "ldr x22, [x12, #0xd8]\n"
+      ".inst 0x4f80eb32  // sdot v18.4s, v25.16b, v0.4b[2]\n"
+      "ldr x21, [x12, #0xe8]\n"
+      ".inst 0x4f80eb13  // sdot v19.4s, v24.16b, v0.4b[2]\n"
+      "ldr x20, [x12, #0xf8]\n"
+      "mov v23.d[1], x23\n"
+      "mov v22.d[1], x22\n"
+      "add x9, x9, #0x10\n"
+      "mov v21.d[1], x21\n"
+      "add x12, x12, #0x100\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0eaf0  // sdot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ead1  // sdot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eab2  // sdot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea93  // sdot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 8f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "8:"  // Height 1: Multiply loop: unique 1: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q4, [x13, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q4, [x12, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "bge 7b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q21, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q20, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q26, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q25, [x12, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q24, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q23, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "sub x11, x11, #0x10\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      "add x10, x10, #0x10\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      "ldr q22, [x12, #0xd0]\n"
+      ".inst 0x4fa0e2b3  // sdot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr q21, [x12, #0xe0]\n"
+      ".inst 0x4f80ea90  // sdot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr q20, [x12, #0xf0]\n"
+      ".inst 0x4f80eb51  // sdot v17.4s, v26.16b, v0.4b[2]\n"
+      "sub x10, x10, #0x10\n"
+      ".inst 0x4f80eb32  // sdot v18.4s, v25.16b, v0.4b[2]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x4f80eb13  // sdot v19.4s, v24.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x4fa0eaf0  // sdot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ead1  // sdot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eab2  // sdot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea93  // sdot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 10f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "10:"  // Height 1: Multiply loop: unique 2: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "11:"  // Height 1: Multiply loop: Main loop skip
-      "cbz x11, 18f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 18f\n"
+      "cmp x10, #0x4\n"
       "blt 14f\n"
       "12:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
       "tbnz %x[flags], #31, 13f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "13:"  // Height 1: Multiply loop: unique 3: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q22, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q21, [x12, #0x20]\n"
+      ".inst 0x4f80e290  // sdot v16.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x30]\n"
+      ".inst 0x4f80e2d1  // sdot v17.4s, v22.16b, v0.4b[0]\n"
+      ".inst 0x4f80e2b2  // sdot v18.4s, v21.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f80e293  // sdot v19.4s, v20.16b, v0.4b[0]\n"
       "bge 12b\n"
       "14:"  // Height 1: Multiply loop: Skip odd blocks
-      "cbz x11, 18f\n"
-      "tbz x11, #1, 15f\n"
-      "ldr h0, [x10], #0x2\n"
-      "tbz x11, #0, 16f\n"
-      "ld1 { v0.b }[2], [x10]\n"
+      "cbz x10, 18f\n"
+      "tbz x10, #1, 15f\n"
+      "ldr h0, [x9], #0x2\n"
+      "tbz x10, #0, 16f\n"
+      "ld1 { v0.b }[2], [x9]\n"
       "b 16f\n"
       "15:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
       "16:"  // Height 1: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 17f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "17:"  // Height 1: Multiply loop: unique 4: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
+      "ldr q20, [x12, #0x0]\n"
+      ".inst 0x4f80e290  // sdot v16.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x10]\n"
+      ".inst 0x4f80e291  // sdot v17.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x20]\n"
+      ".inst 0x4f80e292  // sdot v18.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x30]\n"
+      ".inst 0x4f80e293  // sdot v19.4s, v20.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
       "18:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 4b\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
       "tbnz %x[flags], #31, 19f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v1.4s }, [x23]\n"
-      "neg v1.4s, v1.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "neg v20.4s, v20.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "mul v11.4s, v11.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v20.4s\n"
       "19:"  // Height 1: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q23, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q22, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q21, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q20, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v23.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 20f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v0.16b\n"
+      "and v21.16b, v18.16b, v0.16b\n"
+      "and v20.16b, v19.16b, v0.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "20:"  // Height 1: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v20.4s\n"
+      "add v17.4s, v17.4s, v20.4s\n"
+      "add v18.4s, v18.4s, v20.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v20.4s\n"
+      "smin v17.4s, v17.4s, v20.4s\n"
+      "smin v18.4s, v18.4s, v20.4s\n"
+      "smin v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "cmp x15, #0x10\n"
+      "cmp x14, #0x10\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
       "bge 29f\n"
-      "tbz x15, #3, 24f\n"
-      "str d16, [x14], #0x8\n"
-      "tbz x15, #2, 22f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "tbz x15, #1, 21f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[14], [x14]\n"
+      "tbz x14, #3, 24f\n"
+      "str d16, [x13], #0x8\n"
+      "tbz x14, #2, 22f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "tbz x14, #1, 21f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[14], [x13]\n"
       "b 28f\n"
       "21:"  // Height 1: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[12], [x14]\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[12], [x13]\n"
       "b 28f\n"
       "22:"  // Height 1: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 23f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[10], [x14]\n"
+      "tbz x14, #1, 23f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[10], [x13]\n"
       "b 28f\n"
       "23:"  // Height 1: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[8], [x14]\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[8], [x13]\n"
       "b 28f\n"
       "24:"  // Height 1: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 26f\n"
-      "str s16, [x14], #0x4\n"
-      "tbz x15, #1, 25f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[6], [x14]\n"
+      "tbz x14, #2, 26f\n"
+      "str s16, [x13], #0x4\n"
+      "tbz x14, #1, 25f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[6], [x13]\n"
       "b 28f\n"
       "25:"  // Height 1: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[4], [x14]\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[4], [x13]\n"
       "b 28f\n"
       "26:"  // Height 1: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 27f\n"
-      "str h16, [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[2], [x14]\n"
+      "tbz x14, #1, 27f\n"
+      "str h16, [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[2], [x13]\n"
       "b 28f\n"
       "27:"  // Height 1: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
       "28:"  // Height 1: Partial direct writeback: Done
       "b 30f\n"
       "29:"  // Height 1: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
       "30:"  // Height 1: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 2b\n"
       "b 122f\n"
       "31:"  // Height 2
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v12.4s, #0x0\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "movi v15.16b, #0x1\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x13, %x[output_ptr]\n"
       "32:"  // Height 2: Column loop
       "movi v16.4s, #0x0\n"
       "movi v17.4s, #0x0\n"
@@ -411,307 +410,307 @@
       "movi v22.4s, #0x0\n"
       "movi v23.4s, #0x0\n"
       "33:"  // Height 2: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "34:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 35f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "cbnz x12, 36f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x11, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
-      "add x23, x23, x20\n"
+      "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
       "b 36f\n"
       "35:"  // Height 2: setup direct input
-      "mov x10, %x[input_ptr]\n"
-      "add x23, x10, x20\n"
+      "mov x9, %x[input_ptr]\n"
+      "add x28, x9, x21\n"
       "36:"  // Height 2: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 41f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q1, [x23, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 39f\n"
       "37:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr x20, [x12, #0x78]\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
+      "ldr d25, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "mov v4.d[1], x9\n"
+      "mov v25.d[1], x20\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d24, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr x23, [x12, #0x88]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d30, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr x27, [x13, #0x98]\n"
+      "ldr x22, [x12, #0x98]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
-      "ldr x26, [x13, #0xa8]\n"
+      "ldr d29, [x12, #0xa0]\n"
+      "ldr x21, [x12, #0xa8]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
-      "ldr x25, [x13, #0xb8]\n"
+      "ldr d28, [x12, #0xb0]\n"
+      "ldr x20, [x12, #0xb8]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d27, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "mov v5.d[1], x28\n"
+      "mov v24.d[1], x23\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "mov v6.d[1], x27\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
-      "mov v7.d[1], x26\n"
-      "ldr x24, [x13, #0xc8]\n"
-      "mov v8.d[1], x25\n"
-      "ldr x20, [x13, #0xd8]\n"
-      "ldr x9, [x13, #0xe8]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
-      "ldr x28, [x13, #0xf8]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      "mov v9.d[1], x24\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      "mov v10.d[1], x20\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      "mov v4.d[1], x9\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      "mov v5.d[1], x28\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      "add x10, x10, #0x10\n"
-      "add x23, x23, #0x10\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      "ldr d26, [x12, #0xd0]\n"
+      ".inst 0x4fa0e333  // sdot v19.4s, v25.16b, v0.4b[1]\n"
+      "mov v30.d[1], x22\n"
+      ".inst 0x4fa1e337  // sdot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr d25, [x12, #0xe0]\n"
+      "mov v29.d[1], x21\n"
+      "ldr x23, [x12, #0xc8]\n"
+      "mov v28.d[1], x20\n"
+      "ldr x22, [x12, #0xd8]\n"
+      "ldr x21, [x12, #0xe8]\n"
+      ".inst 0x4f80eb10  // sdot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb14  // sdot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr d24, [x12, #0xf0]\n"
+      "ldr x20, [x12, #0xf8]\n"
+      ".inst 0x4f80ebd1  // sdot v17.4s, v30.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebd5  // sdot v21.4s, v30.16b, v1.4b[2]\n"
+      "mov v27.d[1], x23\n"
+      ".inst 0x4f80ebb2  // sdot v18.4s, v29.16b, v0.4b[2]\n"
+      "mov v26.d[1], x22\n"
+      ".inst 0x4f81ebb6  // sdot v22.4s, v29.16b, v1.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f80eb93  // sdot v19.4s, v28.16b, v0.4b[2]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f81eb97  // sdot v23.4s, v28.16b, v1.4b[2]\n"
+      "add x9, x9, #0x10\n"
+      "add x28, x28, #0x10\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x4fa0eb70  // sdot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb74  // sdot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb51  // sdot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb55  // sdot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb32  // sdot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb36  // sdot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb13  // sdot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb17  // sdot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 38f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       "38:"  // Height 2: Multiply loop: unique 5: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q1, [x23, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q1, [x28, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "bge 37b\n"
       "39:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "sub x11, x11, #0x10\n"
+      "sub x10, x10, #0x10\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q25, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q24, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q30, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q29, [x12, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q28, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q27, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      "ldr q26, [x12, #0xd0]\n"
+      ".inst 0x4fa0e333  // sdot v19.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e337  // sdot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr q25, [x12, #0xe0]\n"
+      ".inst 0x4f80eb10  // sdot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb14  // sdot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr q24, [x12, #0xf0]\n"
+      ".inst 0x4f80ebd1  // sdot v17.4s, v30.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x4f81ebd5  // sdot v21.4s, v30.16b, v1.4b[2]\n"
+      ".inst 0x4f80ebb2  // sdot v18.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebb6  // sdot v22.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f80eb93  // sdot v19.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb97  // sdot v23.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4fa0eb70  // sdot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb74  // sdot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb51  // sdot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb55  // sdot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb32  // sdot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb36  // sdot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb13  // sdot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb17  // sdot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 40f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       "40:"  // Height 2: Multiply loop: unique 6: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "41:"  // Height 2: Multiply loop: Main loop skip
-      "cbz x11, 48f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 48f\n"
+      "cmp x10, #0x4\n"
       "blt 44f\n"
       "42:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
       "tbnz %x[flags], #31, 43f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       "43:"  // Height 2: Multiply loop: unique 7: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
+      "ldr q27, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q26, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q25, [x12, #0x20]\n"
+      ".inst 0x4f80e370  // sdot v16.4s, v27.16b, v0.4b[0]\n"
+      "ldr q24, [x12, #0x30]\n"
+      ".inst 0x4f81e374  // sdot v20.4s, v27.16b, v1.4b[0]\n"
+      ".inst 0x4f80e351  // sdot v17.4s, v26.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f81e355  // sdot v21.4s, v26.16b, v1.4b[0]\n"
+      ".inst 0x4f80e332  // sdot v18.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e336  // sdot v22.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f80e313  // sdot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e317  // sdot v23.4s, v24.16b, v1.4b[0]\n"
       "bge 42b\n"
       "44:"  // Height 2: Multiply loop: Skip odd blocks
-      "cbz x11, 48f\n"
-      "tbz x11, #1, 45f\n"
-      "ldr h0, [x10], #0x2\n"
-      "ldr h1, [x23], #0x2\n"
-      "tbz x11, #0, 46f\n"
-      "ld1 { v0.b }[2], [x10]\n"
-      "ld1 { v1.b }[2], [x23]\n"
+      "cbz x10, 48f\n"
+      "tbz x10, #1, 45f\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "tbz x10, #0, 46f\n"
+      "ld1 { v0.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x28]\n"
       "b 46f\n"
       "45:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
-      "ldr b1, [x23, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
       "46:"  // Height 2: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 47f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       "47:"  // Height 2: Multiply loop: unique 8: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x4f81e154  // sdot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x4f81e095  // sdot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d7  // sdot v23.4s, v6.16b, v1.4b[0]\n"
+      "ldr q24, [x12, #0x0]\n"
+      ".inst 0x4f80e310  // sdot v16.4s, v24.16b, v0.4b[0]\n"
+      "ldr q26, [x12, #0x10]\n"
+      ".inst 0x4f81e314  // sdot v20.4s, v24.16b, v1.4b[0]\n"
+      "ldr q25, [x12, #0x20]\n"
+      ".inst 0x4f80e351  // sdot v17.4s, v26.16b, v0.4b[0]\n"
+      "ldr q24, [x12, #0x30]\n"
+      ".inst 0x4f81e355  // sdot v21.4s, v26.16b, v1.4b[0]\n"
+      ".inst 0x4f80e332  // sdot v18.4s, v25.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f81e336  // sdot v22.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f80e313  // sdot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e317  // sdot v23.4s, v24.16b, v1.4b[0]\n"
       "48:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 34b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x14, x20\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "add x23, x13, x20\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "tbnz %x[flags], #31, 49f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v2.4s }, [x23]\n"
-      "neg v2.4s, v2.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "neg v24.4s, v24.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "mul v11.4s, v11.4s, v2.4s\n"
-      "mul v12.4s, v12.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v24.4s\n"
+      "mul v12.4s, v12.4s, v24.4s\n"
       "49:"  // Height 2: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q27, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q26, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q25, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q24, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v27.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v25.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v25.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 50f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v24.16b, v16.16b, v0.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v24.4s\n"
+      "and v30.16b, v17.16b, v0.16b\n"
+      "and v29.16b, v18.16b, v0.16b\n"
+      "and v28.16b, v19.16b, v0.16b\n"
+      "and v27.16b, v20.16b, v0.16b\n"
+      "and v26.16b, v21.16b, v0.16b\n"
+      "and v25.16b, v22.16b, v0.16b\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v30.4s\n"
+      "sqadd v18.4s, v18.4s, v29.4s\n"
+      "sqadd v19.4s, v19.4s, v28.4s\n"
+      "sqadd v20.4s, v20.4s, v27.4s\n"
+      "sqadd v21.4s, v21.4s, v26.4s\n"
+      "sqadd v22.4s, v22.4s, v25.4s\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
       "50:"  // Height 2: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
@@ -721,122 +720,122 @@
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v24.4s\n"
+      "add v18.4s, v18.4s, v24.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v24.4s\n"
+      "add v21.4s, v21.4s, v24.4s\n"
+      "add v22.4s, v22.4s, v24.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v24.4s\n"
+      "smin v17.4s, v17.4s, v24.4s\n"
+      "smin v18.4s, v18.4s, v24.4s\n"
+      "smin v19.4s, v19.4s, v24.4s\n"
+      "smin v20.4s, v20.4s, v24.4s\n"
+      "smin v21.4s, v21.4s, v24.4s\n"
+      "smin v22.4s, v22.4s, v24.4s\n"
+      "smin v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
-      "cmp x15, #0x10\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v17.8h, v22.8h, v23.8h\n"
+      "cmp x14, #0x10\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v20.16b, v20.16b, v17.16b\n"
       "bge 59f\n"
-      "tbz x15, #3, 54f\n"
-      "str d16, [x14], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "tbz x15, #2, 52f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "tbz x15, #1, 51f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[14], [x14]\n"
-      "st1 { v20.b }[14], [x22]\n"
+      "tbz x14, #3, 54f\n"
+      "str d16, [x13], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "tbz x14, #2, 52f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "tbz x14, #1, 51f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[14], [x13]\n"
+      "st1 { v20.b }[14], [x23]\n"
       "b 58f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[12], [x14]\n"
-      "st1 { v20.b }[12], [x22]\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[12], [x13]\n"
+      "st1 { v20.b }[12], [x23]\n"
       "b 58f\n"
       "52:"  // Height 2: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 53f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[10], [x14]\n"
-      "st1 { v20.b }[10], [x22]\n"
+      "tbz x14, #1, 53f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[10], [x13]\n"
+      "st1 { v20.b }[10], [x23]\n"
       "b 58f\n"
       "53:"  // Height 2: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[8], [x14]\n"
-      "st1 { v20.b }[8], [x22]\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[8], [x13]\n"
+      "st1 { v20.b }[8], [x23]\n"
       "b 58f\n"
       "54:"  // Height 2: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 56f\n"
-      "str s16, [x14], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "tbz x15, #1, 55f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[6], [x14]\n"
-      "st1 { v20.b }[6], [x22]\n"
+      "tbz x14, #2, 56f\n"
+      "str s16, [x13], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "tbz x14, #1, 55f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[6], [x13]\n"
+      "st1 { v20.b }[6], [x23]\n"
       "b 58f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[4], [x14]\n"
-      "st1 { v20.b }[4], [x22]\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[4], [x13]\n"
+      "st1 { v20.b }[4], [x23]\n"
       "b 58f\n"
       "56:"  // Height 2: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 57f\n"
-      "str h16, [x14], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[2], [x14]\n"
-      "st1 { v20.b }[2], [x22]\n"
+      "tbz x14, #1, 57f\n"
+      "str h16, [x13], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[2], [x13]\n"
+      "st1 { v20.b }[2], [x23]\n"
       "b 58f\n"
       "57:"  // Height 2: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
       "58:"  // Height 2: Partial direct writeback: Done
       "b 60f\n"
       "59:"  // Height 2: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q20, [x22, #0x0]\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "str q20, [x23, #0x0]\n"
       "60:"  // Height 2: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 32b\n"
       "b 122f\n"
       "61:"  // Height 3
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v12.4s, #0x0\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "movi v13.4s, #0x0\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
       "movi v15.16b, #0x1\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x13, %x[output_ptr]\n"
       "62:"  // Height 3: Column loop
       "movi v16.4s, #0x0\n"
       "movi v17.4s, #0x0\n"
@@ -851,317 +850,317 @@
       "movi v26.4s, #0x0\n"
       "movi v27.4s, #0x0\n"
       "63:"  // Height 3: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "64:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 65f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "cbnz x12, 66f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x27, [x20, #0x10]\n"
+      "cbnz x11, 66f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
-      "add x23, x23, x20\n"
-      "add x22, x22, x20\n"
+      "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
+      "add x27, x27, x20\n"
       "b 66f\n"
       "65:"  // Height 3: setup direct input
-      "mov x10, %x[input_ptr]\n"
-      "add x23, x10, x20\n"
-      "add x22, x23, x20\n"
+      "mov x9, %x[input_ptr]\n"
+      "add x28, x9, x21\n"
+      "add x27, x28, x21\n"
       "66:"  // Height 3: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 71f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q1, [x23, #0x0]\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 69f\n"
       "67:"  // Height 3: Multiply loop: Main loop head
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr x20, [x12, #0x78]\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr x23, [x12, #0x88]\n"
       ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
+      "ldr d29, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "mov v4.d[1], x9\n"
+      "mov v29.d[1], x20\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr x27, [x13, #0x98]\n"
+      "ldr x22, [x12, #0x98]\n"
       ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d28, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr x26, [x13, #0xa8]\n"
+      "ldr x21, [x12, #0xa8]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr x25, [x13, #0xb8]\n"
+      "ldr x20, [x12, #0xb8]\n"
       ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d5, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "mov v5.d[1], x28\n"
+      "mov v28.d[1], x23\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
-      "mov v6.d[1], x27\n"
+      "mov v5.d[1], x22\n"
       ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
+      "ldr d4, [x12, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
-      "mov v7.d[1], x26\n"
+      "mov v4.d[1], x21\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr x24, [x13, #0xc8]\n"
+      "ldr x23, [x12, #0xc8]\n"
       ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
+      "ldr d3, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
-      "mov v8.d[1], x25\n"
+      "mov v3.d[1], x20\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr x20, [x13, #0xd8]\n"
+      "ldr x22, [x12, #0xd8]\n"
       ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d31, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr x9, [x13, #0xe8]\n"
+      "ldr x21, [x12, #0xe8]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr x28, [x13, #0xf8]\n"
+      "ldr x20, [x12, #0xf8]\n"
       ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "mov v9.d[1], x24\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      "mov v10.d[1], x20\n"
-      ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      "mov v4.d[1], x9\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      "add x10, x10, #0x10\n"
-      ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "mov v5.d[1], x28\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      "add x23, x23, #0x10\n"
-      ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
-      "add x22, x22, #0x10\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8fa  // sdot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4f82e91b  // sdot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e938  // sdot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e959  // sdot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e89a  // sdot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8bb  // sdot v27.4s, v5.16b, v2.4b[3]\n"
+      "ldr d30, [x12, #0xd0]\n"
+      ".inst 0x4fa0e3b3  // sdot v19.4s, v29.16b, v0.4b[1]\n"
+      "mov v31.d[1], x23\n"
+      ".inst 0x4fa1e3b7  // sdot v23.4s, v29.16b, v1.4b[1]\n"
+      "mov v30.d[1], x22\n"
+      ".inst 0x4fa2e3bb  // sdot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr d29, [x12, #0xe0]\n"
+      ".inst 0x4f80eb90  // sdot v16.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81eb94  // sdot v20.4s, v28.16b, v1.4b[2]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x4f82eb98  // sdot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr d28, [x12, #0xf0]\n"
+      ".inst 0x4f80e8b1  // sdot v17.4s, v5.16b, v0.4b[2]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f81e8b5  // sdot v21.4s, v5.16b, v1.4b[2]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f82e8b9  // sdot v25.4s, v5.16b, v2.4b[2]\n"
+      "add x27, x27, #0x10\n"
+      ".inst 0x4f80e892  // sdot v18.4s, v4.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x4f81e896  // sdot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x4f82e89a  // sdot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x4f80e873  // sdot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x4f81e877  // sdot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x4f82e87b  // sdot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x4fa0ebf0  // sdot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebf4  // sdot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebf8  // sdot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebd1  // sdot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebd5  // sdot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebd9  // sdot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebb2  // sdot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebb6  // sdot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebba  // sdot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eb93  // sdot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb97  // sdot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb9b  // sdot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 68f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "68:"  // Height 3: Multiply loop: unique 9: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q1, [x23, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q1, [x28, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       "bge 67b\n"
       "69:"  // Height 3: Multiply loop: Single iteration only
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "sub x11, x11, #0x10\n"
+      "sub x10, x10, #0x10\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q29, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "add x22, x22, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q28, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q5, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q4, [x12, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q3, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q31, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8fa  // sdot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4f82e91b  // sdot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e938  // sdot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e959  // sdot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e89a  // sdot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8bb  // sdot v27.4s, v5.16b, v2.4b[3]\n"
+      "ldr q30, [x12, #0xd0]\n"
+      ".inst 0x4fa0e3b3  // sdot v19.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3b7  // sdot v23.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3bb  // sdot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr q29, [x12, #0xe0]\n"
+      ".inst 0x4f80eb90  // sdot v16.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb94  // sdot v20.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb98  // sdot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr q28, [x12, #0xf0]\n"
+      ".inst 0x4f80e8b1  // sdot v17.4s, v5.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x4f81e8b5  // sdot v21.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8b9  // sdot v25.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x4f80e892  // sdot v18.4s, v4.16b, v0.4b[2]\n"
+      ".inst 0x4f81e896  // sdot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x4f82e89a  // sdot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x4f80e873  // sdot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x4f81e877  // sdot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x4f82e87b  // sdot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x4fa0ebf0  // sdot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebf4  // sdot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebf8  // sdot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebd1  // sdot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebd5  // sdot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebd9  // sdot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebb2  // sdot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebb6  // sdot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebba  // sdot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eb93  // sdot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb97  // sdot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb9b  // sdot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 70f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "70:"  // Height 3: Multiply loop: unique 10: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       "71:"  // Height 3: Multiply loop: Main loop skip
-      "cbz x11, 78f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 78f\n"
+      "cmp x10, #0x4\n"
       "blt 74f\n"
       "72:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
-      "ldr s2, [x22], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x27], #0x4\n"
       "tbnz %x[flags], #31, 73f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "73:"  // Height 3: Multiply loop: unique 11: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d8  // sdot v24.4s, v6.16b, v2.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f9  // sdot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x4f82e11a  // sdot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x4f82e13b  // sdot v27.4s, v9.16b, v2.4b[0]\n"
+      "ldr q31, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q30, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q29, [x12, #0x20]\n"
+      ".inst 0x4f80e3f0  // sdot v16.4s, v31.16b, v0.4b[0]\n"
+      "ldr q28, [x12, #0x30]\n"
+      ".inst 0x4f81e3f4  // sdot v20.4s, v31.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3f8  // sdot v24.4s, v31.16b, v2.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f80e3d1  // sdot v17.4s, v30.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3d5  // sdot v21.4s, v30.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3d9  // sdot v25.4s, v30.16b, v2.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3b6  // sdot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3ba  // sdot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e397  // sdot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e39b  // sdot v27.4s, v28.16b, v2.4b[0]\n"
       "bge 72b\n"
       "74:"  // Height 3: Multiply loop: Skip odd blocks
-      "cbz x11, 78f\n"
-      "tbz x11, #1, 75f\n"
-      "ldr h0, [x10], #0x2\n"
-      "ldr h1, [x23], #0x2\n"
-      "ldr h2, [x22], #0x2\n"
-      "tbz x11, #0, 76f\n"
-      "ld1 { v0.b }[2], [x10]\n"
-      "ld1 { v1.b }[2], [x23]\n"
-      "ld1 { v2.b }[2], [x22]\n"
+      "cbz x10, 78f\n"
+      "tbz x10, #1, 75f\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x27], #0x2\n"
+      "tbz x10, #0, 76f\n"
+      "ld1 { v0.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x27]\n"
       "b 76f\n"
       "75:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
-      "ldr b1, [x23, #0x0]\n"
-      "ldr b2, [x22, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x27, #0x0]\n"
       "76:"  // Height 3: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 77f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "77:"  // Height 3: Multiply loop: unique 12: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x4f81e154  // sdot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x4f82e158  // sdot v24.4s, v10.16b, v2.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x4f81e095  // sdot v21.4s, v4.16b, v1.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f82e099  // sdot v25.4s, v4.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0ba  // sdot v26.4s, v5.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d7  // sdot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0db  // sdot v27.4s, v6.16b, v2.4b[0]\n"
+      "ldr q28, [x12, #0x0]\n"
+      ".inst 0x4f80e390  // sdot v16.4s, v28.16b, v0.4b[0]\n"
+      "ldr q30, [x12, #0x10]\n"
+      ".inst 0x4f81e394  // sdot v20.4s, v28.16b, v1.4b[0]\n"
+      "ldr q29, [x12, #0x20]\n"
+      ".inst 0x4f82e398  // sdot v24.4s, v28.16b, v2.4b[0]\n"
+      "ldr q28, [x12, #0x30]\n"
+      ".inst 0x4f80e3d1  // sdot v17.4s, v30.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3d5  // sdot v21.4s, v30.16b, v1.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f82e3d9  // sdot v25.4s, v30.16b, v2.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3b6  // sdot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3ba  // sdot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e397  // sdot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e39b  // sdot v27.4s, v28.16b, v2.4b[0]\n"
       "78:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 64b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x14, x20\n"
-      "add x21, x22, x20\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "add x23, x13, x20\n"
+      "add x22, x23, x20\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "tbnz %x[flags], #31, 79f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v3.4s }, [x23]\n"
-      "neg v3.4s, v3.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "neg v28.4s, v28.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "mul v11.4s, v11.4s, v3.4s\n"
-      "mul v12.4s, v12.4s, v3.4s\n"
-      "mul v13.4s, v13.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v28.4s\n"
+      "mul v12.4s, v12.4s, v28.4s\n"
+      "mul v13.4s, v13.4s, v28.4s\n"
       "79:"  // Height 3: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q31, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q30, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q29, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q28, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
@@ -1171,73 +1170,73 @@
       "add v25.4s, v25.4s, v13.4s\n"
       "add v26.4s, v26.4s, v13.4s\n"
       "add v27.4s, v27.4s, v13.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v31.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v31.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v31.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 80f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "and v1.16b, v16.16b, v0.16b\n"
+      "and v31.16b, v17.16b, v0.16b\n"
+      "and v30.16b, v18.16b, v0.16b\n"
+      "and v29.16b, v19.16b, v0.16b\n"
+      "and v28.16b, v20.16b, v0.16b\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v1.4s\n"
+      "sqadd v17.4s, v17.4s, v31.4s\n"
+      "sqadd v18.4s, v18.4s, v30.4s\n"
+      "sqadd v19.4s, v19.4s, v29.4s\n"
+      "sqadd v20.4s, v20.4s, v28.4s\n"
+      "and v3.16b, v21.16b, v0.16b\n"
+      "and v2.16b, v22.16b, v0.16b\n"
+      "and v1.16b, v23.16b, v0.16b\n"
+      "and v31.16b, v24.16b, v0.16b\n"
+      "and v30.16b, v25.16b, v0.16b\n"
+      "and v29.16b, v26.16b, v0.16b\n"
+      "and v28.16b, v27.16b, v0.16b\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v3.4s\n"
+      "sqadd v22.4s, v22.4s, v2.4s\n"
+      "sqadd v23.4s, v23.4s, v1.4s\n"
+      "sqadd v24.4s, v24.4s, v31.4s\n"
+      "sqadd v25.4s, v25.4s, v30.4s\n"
+      "sqadd v26.4s, v26.4s, v29.4s\n"
+      "sqadd v27.4s, v27.4s, v28.4s\n"
       "80:"  // Height 3: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
@@ -1251,156 +1250,156 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v28.4s\n"
+      "add v18.4s, v18.4s, v28.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v28.4s\n"
+      "add v21.4s, v21.4s, v28.4s\n"
+      "add v22.4s, v22.4s, v28.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v28.4s\n"
+      "add v25.4s, v25.4s, v28.4s\n"
+      "add v26.4s, v26.4s, v28.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v28.4s\n"
+      "smin v17.4s, v17.4s, v28.4s\n"
+      "smin v18.4s, v18.4s, v28.4s\n"
+      "smin v19.4s, v19.4s, v28.4s\n"
+      "smin v20.4s, v20.4s, v28.4s\n"
+      "smin v21.4s, v21.4s, v28.4s\n"
+      "smin v22.4s, v22.4s, v28.4s\n"
+      "smin v23.4s, v23.4s, v28.4s\n"
+      "smin v24.4s, v24.4s, v28.4s\n"
+      "smin v25.4s, v25.4s, v28.4s\n"
+      "smin v26.4s, v26.4s, v28.4s\n"
+      "smin v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v28.4s\n"
+      "smax v17.4s, v17.4s, v28.4s\n"
+      "smax v18.4s, v18.4s, v28.4s\n"
+      "smax v19.4s, v19.4s, v28.4s\n"
+      "smax v20.4s, v20.4s, v28.4s\n"
+      "smax v21.4s, v21.4s, v28.4s\n"
+      "smax v22.4s, v22.4s, v28.4s\n"
+      "smax v23.4s, v23.4s, v28.4s\n"
+      "smax v24.4s, v24.4s, v28.4s\n"
+      "smax v25.4s, v25.4s, v28.4s\n"
+      "smax v26.4s, v26.4s, v28.4s\n"
+      "smax v27.4s, v27.4s, v28.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v18.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "cmp x15, #0x10\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "cmp x14, #0x10\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v20.16b, v20.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 89f\n"
-      "tbz x15, #3, 84f\n"
-      "str d16, [x14], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "tbz x15, #2, 82f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "tbz x15, #1, 81f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[14], [x14]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "tbz x14, #3, 84f\n"
+      "str d16, [x13], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "tbz x14, #2, 82f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "tbz x14, #1, 81f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[14], [x13]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 88f\n"
       "81:"  // Height 3: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[12], [x14]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[12], [x13]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 88f\n"
       "82:"  // Height 3: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 83f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[10], [x14]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "tbz x14, #1, 83f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[10], [x13]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 88f\n"
       "83:"  // Height 3: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[8], [x14]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[8], [x13]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 88f\n"
       "84:"  // Height 3: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 86f\n"
-      "str s16, [x14], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "tbz x15, #1, 85f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[6], [x14]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "tbz x14, #2, 86f\n"
+      "str s16, [x13], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "tbz x14, #1, 85f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[6], [x13]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 88f\n"
       "85:"  // Height 3: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[4], [x14]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[4], [x13]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 88f\n"
       "86:"  // Height 3: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 87f\n"
-      "str h16, [x14], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[2], [x14]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "tbz x14, #1, 87f\n"
+      "str h16, [x13], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[2], [x13]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 88f\n"
       "87:"  // Height 3: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "88:"  // Height 3: Partial direct writeback: Done
       "b 90f\n"
       "89:"  // Height 3: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "90:"  // Height 3: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 62b\n"
       "b 122f\n"
       "91:"  // Height 4
       "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "mov x20, #0x4\n"
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v12.4s, #0x0\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "movi v13.4s, #0x0\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
       "movi v14.4s, #0x0\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "movi v15.16b, #0x1\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x13, %x[output_ptr]\n"
       "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
       "92:"  // Height 4: Column loop
       "movi v16.4s, #0x0\n"
@@ -1420,117 +1419,117 @@
       "movi v30.4s, #0x0\n"
       "movi v31.4s, #0x0\n"
       "93:"  // Height 4: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "94:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 95f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
-      "cbnz x12, 96f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x27, [x20, #0x10]\n"
+      "ldr x26, [x20, #0x18]\n"
+      "cbnz x11, 96f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
-      "add x23, x23, x20\n"
-      "add x22, x22, x20\n"
-      "add x21, x21, x20\n"
+      "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
+      "add x27, x27, x20\n"
+      "add x26, x26, x20\n"
       "b 96f\n"
       "95:"  // Height 4: setup direct input
-      "mov x10, %x[input_ptr]\n"
-      "add x23, x10, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "mov x9, %x[input_ptr]\n"
+      "add x28, x9, x21\n"
+      "add x27, x28, x21\n"
+      "add x26, x27, x21\n"
       "96:"  // Height 4: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 101f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q1, [x23, #0x0]\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q3, [x21, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q3, [x26, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 99f\n"
       "97:"  // Height 4: Multiply loop: Main loop head
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr x22, [x12, #0x78]\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr x21, [x12, #0x88]\n"
       ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr x27, [x13, #0x98]\n"
+      "ldr x20, [x12, #0x98]\n"
       ".inst 0x4f83e09c  // sdot v28.4s, v4.16b, v3.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
+      "ldr d4, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "mov v4.d[1], x9\n"
+      "mov v4.d[1], x22\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr x26, [x13, #0xa8]\n"
+      "ldr x25, [x12, #0xa8]\n"
       ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr x25, [x13, #0xb8]\n"
+      "ldr x24, [x12, #0xb8]\n"
       ".inst 0x4f83e0bd  // sdot v29.4s, v5.16b, v3.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d5, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "mov v5.d[1], x28\n"
+      "mov v5.d[1], x21\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr x24, [x13, #0xc8]\n"
+      "ldr x23, [x12, #0xc8]\n"
       ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr x20, [x13, #0xd8]\n"
+      "ldr x22, [x12, #0xd8]\n"
       ".inst 0x4f83e0de  // sdot v30.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d6, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x27\n"
+      "mov v6.d[1], x20\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr x9, [x13, #0xe8]\n"
+      "ldr x21, [x12, #0xe8]\n"
       ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr x28, [x13, #0xf8]\n"
+      "ldr x20, [x12, #0xf8]\n"
       ".inst 0x4f83e0ff  // sdot v31.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
+      "ldr d7, [x12, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
-      "mov v7.d[1], x26\n"
+      "mov v7.d[1], x25\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4fa3e11c  // sdot v28.4s, v8.16b, v3.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
+      "ldr d8, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
-      "mov v8.d[1], x25\n"
+      "mov v8.d[1], x24\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
-      "add x22, x22, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
-      "add x21, x21, #0x10\n"
+      "add x26, x26, #0x10\n"
       ".inst 0x4fa3e13d  // sdot v29.4s, v9.16b, v3.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d9, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "mov v9.d[1], x24\n"
+      "mov v9.d[1], x23\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
       ".inst 0x4fa3e15e  // sdot v30.4s, v10.16b, v3.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
+      "ldr d10, [x12, #0xd0]\n"
       ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "mov v10.d[1], x20\n"
+      "mov v10.d[1], x22\n"
       ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
       ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
       ".inst 0x4fa3e09f  // sdot v31.4s, v4.16b, v3.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
+      "ldr d4, [x12, #0xe0]\n"
       ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      "mov v4.d[1], x9\n"
+      "mov v4.d[1], x21\n"
       ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
       ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
       ".inst 0x4f83e8bc  // sdot v28.4s, v5.16b, v3.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
+      "ldr d5, [x12, #0xf0]\n"
       ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "mov v5.d[1], x28\n"
+      "mov v5.d[1], x20\n"
       ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      "add x13, x13, #0x100\n"
+      "add x12, x12, #0x100\n"
       ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x4f83e8dd  // sdot v29.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
@@ -1563,77 +1562,77 @@
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
       "98:"  // Height 4: Multiply loop: unique 13: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q1, [x23, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q3, [x21, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q1, [x28, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q3, [x26, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
       "bge 97b\n"
       "99:"  // Height 4: Multiply loop: Single iteration only
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "sub x11, x11, #0x10\n"
+      "sub x10, x10, #0x10\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f83e09c  // sdot v28.4s, v4.16b, v3.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q4, [x12, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "add x22, x22, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x26, x26, #0x10\n"
       ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
       ".inst 0x4f83e0bd  // sdot v29.4s, v5.16b, v3.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q5, [x12, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x4f83e0de  // sdot v30.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q6, [x12, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x4f83e0ff  // sdot v31.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q7, [x12, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
       ".inst 0x4fa3e11c  // sdot v28.4s, v8.16b, v3.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q8, [x12, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
       ".inst 0x4fa3e13d  // sdot v29.4s, v9.16b, v3.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q9, [x12, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
       ".inst 0x4fa3e15e  // sdot v30.4s, v10.16b, v3.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
+      "ldr q10, [x12, #0xd0]\n"
       ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
       ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
       ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
       ".inst 0x4fa3e09f  // sdot v31.4s, v4.16b, v3.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
+      "ldr q4, [x12, #0xe0]\n"
       ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
       ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
       ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
       ".inst 0x4f83e8bc  // sdot v28.4s, v5.16b, v3.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
+      "ldr q5, [x12, #0xf0]\n"
       ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
+      "add x12, x12, #0x100\n"
       ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
       ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x4f83e8dd  // sdot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1667,67 +1666,67 @@
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
       "100:"  // Height 4: Multiply loop: unique 14: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
       "101:"  // Height 4: Multiply loop: Main loop skip
-      "cbz x11, 108f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 108f\n"
+      "cmp x10, #0x4\n"
       "blt 104f\n"
       "102:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
-      "ldr s2, [x22], #0x4\n"
-      "ldr s3, [x21], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x27], #0x4\n"
+      "ldr s3, [x26], #0x4\n"
       "tbnz %x[flags], #31, 103f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
       "103:"  // Height 4: Multiply loop: unique 15: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d8  // sdot v24.4s, v6.16b, v2.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f83e0dc  // sdot v28.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f9  // sdot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0fd  // sdot v29.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x4f82e11a  // sdot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x4f83e11e  // sdot v30.4s, v8.16b, v3.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x4f82e13b  // sdot v27.4s, v9.16b, v2.4b[0]\n"
-      ".inst 0x4f83e13f  // sdot v31.4s, v9.16b, v3.4b[0]\n"
+      "ldr q7, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q6, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q5, [x12, #0x20]\n"
+      ".inst 0x4f80e0f0  // sdot v16.4s, v7.16b, v0.4b[0]\n"
+      "ldr q4, [x12, #0x30]\n"
+      ".inst 0x4f81e0f4  // sdot v20.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f8  // sdot v24.4s, v7.16b, v2.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f83e0fc  // sdot v28.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d9  // sdot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0dd  // sdot v29.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0ba  // sdot v26.4s, v5.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0be  // sdot v30.4s, v5.16b, v3.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x4f82e09b  // sdot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x4f83e09f  // sdot v31.4s, v4.16b, v3.4b[0]\n"
       "bge 102b\n"
       "104:"  // Height 4: Multiply loop: Skip odd blocks
-      "cbz x11, 108f\n"
-      "tbz x11, #1, 105f\n"
-      "ldr h0, [x10], #0x2\n"
-      "ldr h1, [x23], #0x2\n"
-      "ldr h2, [x22], #0x2\n"
-      "ldr h3, [x21], #0x2\n"
-      "tbz x11, #0, 106f\n"
-      "ld1 { v0.b }[2], [x10]\n"
-      "ld1 { v1.b }[2], [x23]\n"
-      "ld1 { v2.b }[2], [x22]\n"
-      "ld1 { v3.b }[2], [x21]\n"
+      "cbz x10, 108f\n"
+      "tbz x10, #1, 105f\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x27], #0x2\n"
+      "ldr h3, [x26], #0x2\n"
+      "tbz x10, #0, 106f\n"
+      "ld1 { v0.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x27]\n"
+      "ld1 { v3.b }[2], [x26]\n"
       "b 106f\n"
       "105:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
-      "ldr b1, [x23, #0x0]\n"
-      "ldr b2, [x22, #0x0]\n"
-      "ldr b3, [x21, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x27, #0x0]\n"
+      "ldr b3, [x26, #0x0]\n"
       "106:"  // Height 4: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 107f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
@@ -1735,64 +1734,64 @@
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
       "107:"  // Height 4: Multiply loop: unique 16: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x4f81e154  // sdot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x4f82e158  // sdot v24.4s, v10.16b, v2.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x4f83e15c  // sdot v28.4s, v10.16b, v3.4b[0]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x4f81e095  // sdot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x4f82e099  // sdot v25.4s, v4.16b, v2.4b[0]\n"
-      ".inst 0x4f83e09d  // sdot v29.4s, v4.16b, v3.4b[0]\n"
+      "ldr q7, [x12, #0x0]\n"
+      ".inst 0x4f80e0f0  // sdot v16.4s, v7.16b, v0.4b[0]\n"
+      "ldr q6, [x12, #0x10]\n"
+      ".inst 0x4f81e0f4  // sdot v20.4s, v7.16b, v1.4b[0]\n"
+      "ldr q5, [x12, #0x20]\n"
+      ".inst 0x4f82e0f8  // sdot v24.4s, v7.16b, v2.4b[0]\n"
+      "ldr q4, [x12, #0x30]\n"
+      ".inst 0x4f83e0fc  // sdot v28.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d9  // sdot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0dd  // sdot v29.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x4f82e0ba  // sdot v26.4s, v5.16b, v2.4b[0]\n"
       ".inst 0x4f83e0be  // sdot v30.4s, v5.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d7  // sdot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0db  // sdot v27.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0df  // sdot v31.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x4f82e09b  // sdot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x4f83e09f  // sdot v31.4s, v4.16b, v3.4b[0]\n"
       "108:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 94b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x14, x20\n"
+      "add x23, x13, x20\n"
+      "add x22, x23, x20\n"
       "add x21, x22, x20\n"
-      "add x20, x21, x20\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
       "prfm pstl1keep, [x21, #0x0]\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
       "tbnz %x[flags], #31, 109f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "neg v4.4s, v4.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "neg v0.4s, v0.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "mul v11.4s, v11.4s, v4.4s\n"
-      "mul v12.4s, v12.4s, v4.4s\n"
-      "mul v13.4s, v13.4s, v4.4s\n"
-      "mul v14.4s, v14.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v0.4s\n"
+      "mul v12.4s, v12.4s, v0.4s\n"
+      "mul v13.4s, v13.4s, v0.4s\n"
+      "mul v14.4s, v14.4s, v0.4s\n"
       "109:"  // Height 4: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q3, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q2, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q1, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q0, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
@@ -1806,93 +1805,93 @@
       "add v29.4s, v29.4s, v14.4s\n"
       "add v30.4s, v30.4s, v14.4s\n"
       "add v31.4s, v31.4s, v14.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "add v28.4s, v28.4s, v0.4s\n"
-      "add v29.4s, v29.4s, v1.4s\n"
-      "add v30.4s, v30.4s, v2.4s\n"
-      "add v31.4s, v31.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v2.4s\n"
+      "add v18.4s, v18.4s, v1.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v2.4s\n"
+      "add v22.4s, v22.4s, v1.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v2.4s\n"
+      "add v26.4s, v26.4s, v1.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v2.4s\n"
+      "add v30.4s, v30.4s, v1.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
-      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
-      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
-      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 110f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v2.16b, v16.16b, v0.16b\n"
+      "and v1.16b, v17.16b, v0.16b\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v2.4s\n"
+      "sqadd v17.4s, v17.4s, v1.4s\n"
+      "and v7.16b, v18.16b, v0.16b\n"
+      "and v6.16b, v19.16b, v0.16b\n"
+      "and v5.16b, v20.16b, v0.16b\n"
+      "and v4.16b, v21.16b, v0.16b\n"
+      "and v3.16b, v22.16b, v0.16b\n"
+      "and v2.16b, v23.16b, v0.16b\n"
+      "and v1.16b, v24.16b, v0.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "and v9.16b, v28.16b, v0.16b\n"
-      "and v10.16b, v29.16b, v0.16b\n"
-      "and v4.16b, v30.16b, v0.16b\n"
-      "and v5.16b, v31.16b, v0.16b\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
-      "sqadd v28.4s, v28.4s, v9.4s\n"
-      "sqadd v29.4s, v29.4s, v10.4s\n"
-      "sqadd v30.4s, v30.4s, v4.4s\n"
-      "sqadd v31.4s, v31.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v7.4s\n"
+      "sqadd v19.4s, v19.4s, v6.4s\n"
+      "sqadd v20.4s, v20.4s, v5.4s\n"
+      "sqadd v21.4s, v21.4s, v4.4s\n"
+      "sqadd v22.4s, v22.4s, v3.4s\n"
+      "sqadd v23.4s, v23.4s, v2.4s\n"
+      "sqadd v24.4s, v24.4s, v1.4s\n"
+      "and v7.16b, v25.16b, v0.16b\n"
+      "and v6.16b, v26.16b, v0.16b\n"
+      "and v5.16b, v27.16b, v0.16b\n"
+      "and v4.16b, v28.16b, v0.16b\n"
+      "and v3.16b, v29.16b, v0.16b\n"
+      "and v2.16b, v30.16b, v0.16b\n"
+      "and v1.16b, v31.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v7.4s\n"
+      "sqadd v26.4s, v26.4s, v6.4s\n"
+      "sqadd v27.4s, v27.4s, v5.4s\n"
+      "sqadd v28.4s, v28.4s, v4.4s\n"
+      "sqadd v29.4s, v29.4s, v3.4s\n"
+      "sqadd v30.4s, v30.4s, v2.4s\n"
+      "sqadd v31.4s, v31.4s, v1.4s\n"
       "110:"  // Height 4: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
@@ -1910,172 +1909,172 @@
       "srshl v29.4s, v29.4s, v0.4s\n"
       "srshl v30.4s, v30.4s, v0.4s\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v0.4s\n"
+      "add v18.4s, v18.4s, v0.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v0.4s\n"
+      "add v22.4s, v22.4s, v0.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v25.4s, v25.4s, v0.4s\n"
+      "add v26.4s, v26.4s, v0.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v0.4s\n"
+      "add v29.4s, v29.4s, v0.4s\n"
+      "add v30.4s, v30.4s, v0.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v0.4s\n"
+      "smin v17.4s, v17.4s, v0.4s\n"
+      "smin v18.4s, v18.4s, v0.4s\n"
+      "smin v19.4s, v19.4s, v0.4s\n"
+      "smin v20.4s, v20.4s, v0.4s\n"
+      "smin v21.4s, v21.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v0.4s\n"
+      "smin v23.4s, v23.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v0.4s\n"
+      "smin v25.4s, v25.4s, v0.4s\n"
+      "smin v26.4s, v26.4s, v0.4s\n"
+      "smin v27.4s, v27.4s, v0.4s\n"
+      "smin v28.4s, v28.4s, v0.4s\n"
+      "smin v29.4s, v29.4s, v0.4s\n"
+      "smin v30.4s, v30.4s, v0.4s\n"
+      "smin v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v0.4s\n"
+      "smax v17.4s, v17.4s, v0.4s\n"
+      "smax v18.4s, v18.4s, v0.4s\n"
+      "smax v19.4s, v19.4s, v0.4s\n"
+      "smax v20.4s, v20.4s, v0.4s\n"
+      "smax v21.4s, v21.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v0.4s\n"
+      "smax v23.4s, v23.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v0.4s\n"
+      "smax v25.4s, v25.4s, v0.4s\n"
+      "smax v26.4s, v26.4s, v0.4s\n"
+      "smax v27.4s, v27.4s, v0.4s\n"
+      "smax v28.4s, v28.4s, v0.4s\n"
+      "smax v29.4s, v29.4s, v0.4s\n"
+      "smax v30.4s, v30.4s, v0.4s\n"
+      "smax v31.4s, v31.4s, v0.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v0.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v19.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v18.8h, v26.8h, v27.8h\n"
       "uzp1 v28.8h, v28.8h, v29.8h\n"
-      "uzp1 v29.8h, v30.8h, v31.8h\n"
-      "cmp x15, #0x10\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
-      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "uzp1 v17.8h, v30.8h, v31.8h\n"
+      "cmp x14, #0x10\n"
+      "uzp1 v16.16b, v16.16b, v0.16b\n"
+      "uzp1 v20.16b, v20.16b, v19.16b\n"
+      "uzp1 v24.16b, v24.16b, v18.16b\n"
+      "uzp1 v28.16b, v28.16b, v17.16b\n"
       "bge 119f\n"
-      "tbz x15, #3, 114f\n"
-      "str d16, [x14], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "str d28, [x20], #0x8\n"
-      "tbz x15, #2, 112f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "st1 { v28.s }[2], [x20], #0x4\n"
-      "tbz x15, #1, 111f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "st1 { v28.h }[6], [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[14], [x14]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
-      "st1 { v28.b }[14], [x20]\n"
+      "tbz x14, #3, 114f\n"
+      "str d16, [x13], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x14, #2, 112f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
+      "tbz x14, #1, 111f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[14], [x13]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
+      "st1 { v28.b }[14], [x21]\n"
       "b 118f\n"
       "111:"  // Height 4: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[12], [x14]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
-      "st1 { v28.b }[12], [x20]\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[12], [x13]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
+      "st1 { v28.b }[12], [x21]\n"
       "b 118f\n"
       "112:"  // Height 4: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 113f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "st1 { v28.h }[4], [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[10], [x14]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
-      "st1 { v28.b }[10], [x20]\n"
+      "tbz x14, #1, 113f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[10], [x13]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
+      "st1 { v28.b }[10], [x21]\n"
       "b 118f\n"
       "113:"  // Height 4: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[8], [x14]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
-      "st1 { v28.b }[8], [x20]\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[8], [x13]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
+      "st1 { v28.b }[8], [x21]\n"
       "b 118f\n"
       "114:"  // Height 4: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 116f\n"
-      "str s16, [x14], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "str s28, [x20], #0x4\n"
-      "tbz x15, #1, 115f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "st1 { v28.h }[2], [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[6], [x14]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
-      "st1 { v28.b }[6], [x20]\n"
+      "tbz x14, #2, 116f\n"
+      "str s16, [x13], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "str s28, [x21], #0x4\n"
+      "tbz x14, #1, 115f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[6], [x13]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
+      "st1 { v28.b }[6], [x21]\n"
       "b 118f\n"
       "115:"  // Height 4: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[4], [x14]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
-      "st1 { v28.b }[4], [x20]\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[4], [x13]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
+      "st1 { v28.b }[4], [x21]\n"
       "b 118f\n"
       "116:"  // Height 4: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 117f\n"
-      "str h16, [x14], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "str h28, [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[2], [x14]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
-      "st1 { v28.b }[2], [x20]\n"
+      "tbz x14, #1, 117f\n"
+      "str h16, [x13], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "str h28, [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[2], [x13]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
+      "st1 { v28.b }[2], [x21]\n"
       "b 118f\n"
       "117:"  // Height 4: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
-      "str b28, [x20, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
       "118:"  // Height 4: Partial direct writeback: Done
       "b 120f\n"
       "119:"  // Height 4: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
-      "str q28, [x20, #0x0]\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
       "120:"  // Height 4: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 92b\n"
       "subs %x[M], %x[M], #0x4\n"
       "beq 122f\n"
@@ -2089,10 +2088,9 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "122:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 485a47d..3b773a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -78,7 +78,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 91f\n"
@@ -102,11 +101,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -128,32 +127,32 @@
       "blt 9f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q21, [x28, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q20, [x28, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q26, [x28, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q25, [x28, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q24, [x28, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q23, [x28, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q22, [x28, #0xd0]\n"
+      ".inst 0x4fa0e2b3  // sdot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr q21, [x28, #0xe0]\n"
+      ".inst 0x4f80ea90  // sdot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr q20, [x28, #0xf0]\n"
+      ".inst 0x4f80eb51  // sdot v17.4s, v26.16b, v0.4b[2]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x4f80eb32  // sdot v18.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f80eb13  // sdot v19.4s, v24.16b, v0.4b[2]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eaf0  // sdot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ead1  // sdot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eab2  // sdot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea93  // sdot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 8f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "8:"  // Height 1: Multiply loop: unique 1: skip row sum
@@ -171,33 +170,33 @@
       "bge 7b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q21, [x28, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q20, [x28, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q26, [x28, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q25, [x28, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q24, [x28, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q23, [x28, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q22, [x28, #0xd0]\n"
+      ".inst 0x4fa0e2b3  // sdot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr q21, [x28, #0xe0]\n"
+      ".inst 0x4f80ea90  // sdot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr q20, [x28, #0xf0]\n"
+      ".inst 0x4f80eb51  // sdot v17.4s, v26.16b, v0.4b[2]\n"
       "sub x25, x25, #0x10\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x4f80eb32  // sdot v18.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f80eb13  // sdot v19.4s, v24.16b, v0.4b[2]\n"
       "add x24, x24, #0x10\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eaf0  // sdot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ead1  // sdot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eab2  // sdot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea93  // sdot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 10f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "10:"  // Height 1: Multiply loop: unique 2: skip row sum
@@ -211,16 +210,16 @@
       "tbnz %x[flags], #31, 13f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "13:"  // Height 1: Multiply loop: unique 3: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q23, [x28, #0x0]\n"
+      "ldr q22, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q21, [x28, #0x20]\n"
+      "ldr q20, [x28, #0x30]\n"
+      ".inst 0x4f80e2f0  // sdot v16.4s, v23.16b, v0.4b[0]\n"
+      ".inst 0x4f80e2d1  // sdot v17.4s, v22.16b, v0.4b[0]\n"
+      ".inst 0x4f80e2b2  // sdot v18.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f80e293  // sdot v19.4s, v20.16b, v0.4b[0]\n"
       "add x28, x28, #0x40\n"
       "bge 12b\n"
       "14:"  // Height 1: Multiply loop: Skip odd blocks
@@ -236,14 +235,14 @@
       "tbnz %x[flags], #31, 17f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "17:"  // Height 1: Multiply loop: unique 4: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
+      "ldr q21, [x28, #0x0]\n"
+      "ldr q20, [x28, #0x10]\n"
+      ".inst 0x4f80e2b0  // sdot v16.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f80e291  // sdot v17.4s, v20.16b, v0.4b[0]\n"
+      "ldr q21, [x28, #0x20]\n"
+      "ldr q20, [x28, #0x30]\n"
+      ".inst 0x4f80e2b2  // sdot v18.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f80e293  // sdot v19.4s, v20.16b, v0.4b[0]\n"
       "add x28, x28, #0x40\n"
       "18:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -252,72 +251,72 @@
       "bne 4b\n"
       "prfm pstl1keep, [x27, #0x0]\n"
       "tbnz %x[flags], #31, 19f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v1.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "neg v1.4s, v1.4s\n"
+      "neg v20.4s, v20.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "mul v11.4s, v11.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v20.4s\n"
       "19:"  // Height 1: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q24, [x10, #0x0]\n"
+      "ldr q23, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q22, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "add v16.4s, v16.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v23.4s\n"
+      "add v18.4s, v18.4s, v22.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v19.4s, v19.4s, v21.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v20.4s\n"
       "add x10, x10, #0x40\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v20.4s\n"
       "tbz %x[flags], #5, 20f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v0.16b\n"
+      "and v21.16b, v18.16b, v0.16b\n"
+      "and v20.16b, v19.16b, v0.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "20:"  // Height 1: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v21.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v22.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v18.4s, v18.4s, v22.4s\n"
+      "add v19.4s, v19.4s, v22.4s\n"
       "cmp x9, #0x10\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "smin v16.4s, v16.4s, v21.4s\n"
+      "smin v17.4s, v17.4s, v21.4s\n"
+      "smin v18.4s, v18.4s, v21.4s\n"
+      "smin v19.4s, v19.4s, v21.4s\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
@@ -397,12 +396,12 @@
       "34:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 35f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -410,7 +409,7 @@
       "b 36f\n"
       "35:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "36:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "blt 41f\n"
@@ -428,48 +427,48 @@
       "37:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x4fa0e333  // sdot v19.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e337  // sdot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x4f80eb10  // sdot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb14  // sdot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr q24, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4f80ebd1  // sdot v17.4s, v30.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebd5  // sdot v21.4s, v30.16b, v1.4b[2]\n"
+      ".inst 0x4f80ebb2  // sdot v18.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebb6  // sdot v22.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f80eb93  // sdot v19.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb97  // sdot v23.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4fa0eb70  // sdot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb74  // sdot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb51  // sdot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb55  // sdot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb32  // sdot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb36  // sdot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb13  // sdot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb17  // sdot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 38f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
@@ -491,49 +490,49 @@
       "39:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
       ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       "sub x25, x25, #0x10\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x4fa0e333  // sdot v19.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e337  // sdot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x4f80eb10  // sdot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb14  // sdot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr q24, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4f80ebd1  // sdot v17.4s, v30.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebd5  // sdot v21.4s, v30.16b, v1.4b[2]\n"
+      ".inst 0x4f80ebb2  // sdot v18.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebb6  // sdot v22.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f80eb93  // sdot v19.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb97  // sdot v23.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4fa0eb70  // sdot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb74  // sdot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb51  // sdot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb55  // sdot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb32  // sdot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb36  // sdot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa0eb13  // sdot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb17  // sdot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 40f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
@@ -551,21 +550,21 @@
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       "43:"  // Height 2: Multiply loop: unique 7: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q27, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x4f80e370  // sdot v16.4s, v27.16b, v0.4b[0]\n"
+      ".inst 0x4f81e374  // sdot v20.4s, v27.16b, v1.4b[0]\n"
+      ".inst 0x4f80e351  // sdot v17.4s, v26.16b, v0.4b[0]\n"
+      ".inst 0x4f81e355  // sdot v21.4s, v26.16b, v1.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
+      ".inst 0x4f80e332  // sdot v18.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e336  // sdot v22.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f80e313  // sdot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e317  // sdot v23.4s, v24.16b, v1.4b[0]\n"
       "bge 42b\n"
       "44:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x25, 48f\n"
@@ -584,209 +583,209 @@
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       "47:"  // Height 2: Multiply loop: unique 8: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x4f81e154  // sdot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x4f81e095  // sdot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      ".inst 0x4f80e310  // sdot v16.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e314  // sdot v20.4s, v24.16b, v1.4b[0]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x4f80e351  // sdot v17.4s, v26.16b, v0.4b[0]\n"
+      ".inst 0x4f81e355  // sdot v21.4s, v26.16b, v1.4b[0]\n"
+      ".inst 0x4f80e332  // sdot v18.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e336  // sdot v22.4s, v25.16b, v1.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d7  // sdot v23.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e313  // sdot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e317  // sdot v23.4s, v24.16b, v1.4b[0]\n"
       "48:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 34b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x27, x20\n"
+      "add x23, x27, x20\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "tbnz %x[flags], #31, 49f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v2.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "neg v2.4s, v2.4s\n"
+      "neg v24.4s, v24.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "mul v11.4s, v11.4s, v2.4s\n"
-      "mul v12.4s, v12.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v24.4s\n"
+      "mul v12.4s, v12.4s, v24.4s\n"
       "49:"  // Height 2: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q27, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q26, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v27.4s\n"
       "add x10, x10, #0x40\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v25.4s\n"
+      "add v20.4s, v20.4s, v28.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v21.4s, v21.4s, v27.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v23.4s, v23.4s, v25.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v24.4s\n"
       "tbz %x[flags], #5, 50f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v24.16b, v16.16b, v0.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v24.4s\n"
+      "and v30.16b, v17.16b, v0.16b\n"
+      "and v29.16b, v18.16b, v0.16b\n"
+      "and v28.16b, v19.16b, v0.16b\n"
+      "and v27.16b, v20.16b, v0.16b\n"
+      "and v26.16b, v21.16b, v0.16b\n"
+      "and v25.16b, v22.16b, v0.16b\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v30.4s\n"
+      "sqadd v18.4s, v18.4s, v29.4s\n"
+      "sqadd v19.4s, v19.4s, v28.4s\n"
+      "sqadd v20.4s, v20.4s, v27.4s\n"
+      "sqadd v21.4s, v21.4s, v26.4s\n"
+      "sqadd v22.4s, v22.4s, v25.4s\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
       "50:"  // Height 2: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "cmp x9, #0x10\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v26.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v26.4s\n"
+      "add v20.4s, v20.4s, v26.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v23.4s, v23.4s, v26.4s\n"
+      "smin v16.4s, v16.4s, v25.4s\n"
+      "smin v17.4s, v17.4s, v25.4s\n"
+      "smin v18.4s, v18.4s, v25.4s\n"
+      "smin v19.4s, v19.4s, v25.4s\n"
+      "smin v20.4s, v20.4s, v25.4s\n"
+      "smin v21.4s, v21.4s, v25.4s\n"
+      "smin v22.4s, v22.4s, v25.4s\n"
+      "smin v23.4s, v23.4s, v25.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v17.8h, v22.8h, v23.8h\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v20.16b, v20.16b, v17.16b\n"
       "bge 59f\n"
       "tbz x9, #3, 54f\n"
       "str d16, [x27], #0x8\n"
-      "str d20, [x22], #0x8\n"
+      "str d20, [x23], #0x8\n"
       "tbz x9, #2, 52f\n"
       "st1 { v16.s }[2], [x27], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
       "tbz x9, #1, 51f\n"
       "st1 { v16.h }[6], [x27], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[14], [x27]\n"
-      "st1 { v20.b }[14], [x22]\n"
+      "st1 { v20.b }[14], [x23]\n"
       "b 58f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[12], [x27]\n"
-      "st1 { v20.b }[12], [x22]\n"
+      "st1 { v20.b }[12], [x23]\n"
       "b 58f\n"
       "52:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x9, #1, 53f\n"
       "st1 { v16.h }[4], [x27], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[10], [x27]\n"
-      "st1 { v20.b }[10], [x22]\n"
+      "st1 { v20.b }[10], [x23]\n"
       "b 58f\n"
       "53:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[8], [x27]\n"
-      "st1 { v20.b }[8], [x22]\n"
+      "st1 { v20.b }[8], [x23]\n"
       "b 58f\n"
       "54:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x9, #2, 56f\n"
       "str s16, [x27], #0x4\n"
-      "str s20, [x22], #0x4\n"
+      "str s20, [x23], #0x4\n"
       "tbz x9, #1, 55f\n"
       "st1 { v16.h }[2], [x27], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[6], [x27]\n"
-      "st1 { v20.b }[6], [x22]\n"
+      "st1 { v20.b }[6], [x23]\n"
       "b 58f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[4], [x27]\n"
-      "st1 { v20.b }[4], [x22]\n"
+      "st1 { v20.b }[4], [x23]\n"
       "b 58f\n"
       "56:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x9, #1, 57f\n"
       "str h16, [x27], #0x2\n"
-      "str h20, [x22], #0x2\n"
+      "str h20, [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[2], [x27]\n"
-      "st1 { v20.b }[2], [x22]\n"
+      "st1 { v20.b }[2], [x23]\n"
       "b 58f\n"
       "57:"  // Height 2: Partial direct writeback: partial_1_0
       "str b16, [x27, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
       "58:"  // Height 2: Partial direct writeback: Done
       "b 60f\n"
       "59:"  // Height 2: Full writeback
       "str q16, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q20, [x22, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
       "60:"  // Height 2: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 32b\n"
@@ -819,13 +818,13 @@
       "64:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 65f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 66f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -834,8 +833,8 @@
       "b 66f\n"
       "65:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "66:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "blt 71f\n"
@@ -857,62 +856,62 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q29, [x28, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q28, [x28, #0x80]\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q5, [x28, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q4, [x28, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q3, [x28, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q31, [x28, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q30, [x28, #0xd0]\n"
+      ".inst 0x4fa0e3b3  // sdot v19.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3b7  // sdot v23.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3bb  // sdot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr q29, [x28, #0xe0]\n"
+      ".inst 0x4f80eb90  // sdot v16.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb94  // sdot v20.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb98  // sdot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr q28, [x28, #0xf0]\n"
+      ".inst 0x4f80e8b1  // sdot v17.4s, v5.16b, v0.4b[2]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8fa  // sdot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4f82e91b  // sdot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e938  // sdot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e959  // sdot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e89a  // sdot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8bb  // sdot v27.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x4f81e8b5  // sdot v21.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8b9  // sdot v25.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x4f80e892  // sdot v18.4s, v4.16b, v0.4b[2]\n"
+      ".inst 0x4f81e896  // sdot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x4f82e89a  // sdot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x4f80e873  // sdot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x4f81e877  // sdot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x4f82e87b  // sdot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x4fa0ebf0  // sdot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebf4  // sdot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebf8  // sdot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebd1  // sdot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebd5  // sdot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebd9  // sdot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebb2  // sdot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebb6  // sdot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebba  // sdot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eb93  // sdot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb97  // sdot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb9b  // sdot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 68f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
@@ -940,63 +939,63 @@
       "sub x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q29, [x28, #0x70]\n"
       ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q28, [x28, #0x80]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q5, [x28, #0x90]\n"
       ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q4, [x28, #0xa0]\n"
       ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q3, [x28, #0xb0]\n"
       ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q31, [x28, #0xc0]\n"
       ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q30, [x28, #0xd0]\n"
+      ".inst 0x4fa0e3b3  // sdot v19.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3b7  // sdot v23.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3bb  // sdot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr q29, [x28, #0xe0]\n"
+      ".inst 0x4f80eb90  // sdot v16.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb94  // sdot v20.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb98  // sdot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr q28, [x28, #0xf0]\n"
+      ".inst 0x4f80e8b1  // sdot v17.4s, v5.16b, v0.4b[2]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8fa  // sdot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x4f82e91b  // sdot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e938  // sdot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e959  // sdot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e89a  // sdot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8bb  // sdot v27.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x4f81e8b5  // sdot v21.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8b9  // sdot v25.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x4f80e892  // sdot v18.4s, v4.16b, v0.4b[2]\n"
+      ".inst 0x4f81e896  // sdot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x4f82e89a  // sdot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x4f80e873  // sdot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x4f81e877  // sdot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x4f82e87b  // sdot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x4fa0ebf0  // sdot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebf4  // sdot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebf8  // sdot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebd1  // sdot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebd5  // sdot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebd9  // sdot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ebb2  // sdot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebb6  // sdot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebba  // sdot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eb93  // sdot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb97  // sdot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb9b  // sdot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 70f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
@@ -1018,25 +1017,25 @@
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "73:"  // Height 3: Multiply loop: unique 11: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q31, [x28, #0x0]\n"
+      "ldr q30, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d8  // sdot v24.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q29, [x28, #0x20]\n"
+      "ldr q28, [x28, #0x30]\n"
+      ".inst 0x4f80e3f0  // sdot v16.4s, v31.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3f4  // sdot v20.4s, v31.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3f8  // sdot v24.4s, v31.16b, v2.4b[0]\n"
+      ".inst 0x4f80e3d1  // sdot v17.4s, v30.16b, v0.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f9  // sdot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x4f82e11a  // sdot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x4f82e13b  // sdot v27.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x4f81e3d5  // sdot v21.4s, v30.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3d9  // sdot v25.4s, v30.16b, v2.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3b6  // sdot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3ba  // sdot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e397  // sdot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e39b  // sdot v27.4s, v28.16b, v2.4b[0]\n"
       "bge 72b\n"
       "74:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x25, 78f\n"
@@ -1059,144 +1058,144 @@
       ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "77:"  // Height 3: Multiply loop: unique 12: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x4f81e154  // sdot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4f82e158  // sdot v24.4s, v10.16b, v2.4b[0]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x4f81e095  // sdot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x4f82e099  // sdot v25.4s, v4.16b, v2.4b[0]\n"
+      "ldr q31, [x28, #0x0]\n"
+      "ldr q30, [x28, #0x10]\n"
+      ".inst 0x4f80e3f0  // sdot v16.4s, v31.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3f4  // sdot v20.4s, v31.16b, v1.4b[0]\n"
+      "ldr q29, [x28, #0x20]\n"
+      "ldr q28, [x28, #0x30]\n"
+      ".inst 0x4f82e3f8  // sdot v24.4s, v31.16b, v2.4b[0]\n"
+      ".inst 0x4f80e3d1  // sdot v17.4s, v30.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3d5  // sdot v21.4s, v30.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3d9  // sdot v25.4s, v30.16b, v2.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0ba  // sdot v26.4s, v5.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d7  // sdot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0db  // sdot v27.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3b6  // sdot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3ba  // sdot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e397  // sdot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e39b  // sdot v27.4s, v28.16b, v2.4b[0]\n"
       "78:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 64b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "tbnz %x[flags], #31, 79f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v3.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "neg v3.4s, v3.4s\n"
+      "neg v28.4s, v28.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "mul v11.4s, v11.4s, v3.4s\n"
-      "mul v12.4s, v12.4s, v3.4s\n"
-      "mul v13.4s, v13.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v28.4s\n"
+      "mul v12.4s, v12.4s, v28.4s\n"
+      "mul v13.4s, v13.4s, v28.4s\n"
       "79:"  // Height 3: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q31, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q30, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v24.4s, v24.4s, v13.4s\n"
       "add v25.4s, v25.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
       "add v26.4s, v26.4s, v13.4s\n"
       "add v27.4s, v27.4s, v13.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v31.4s\n"
+      "add v18.4s, v18.4s, v30.4s\n"
+      "add v19.4s, v19.4s, v29.4s\n"
       "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v31.4s\n"
+      "add v22.4s, v22.4s, v30.4s\n"
+      "add v23.4s, v23.4s, v29.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v31.4s\n"
+      "add v26.4s, v26.4s, v30.4s\n"
+      "add v27.4s, v27.4s, v29.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v28.4s\n"
       "tbz %x[flags], #5, 80f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "and v1.16b, v16.16b, v0.16b\n"
+      "and v31.16b, v17.16b, v0.16b\n"
+      "and v30.16b, v18.16b, v0.16b\n"
+      "and v29.16b, v19.16b, v0.16b\n"
+      "and v28.16b, v20.16b, v0.16b\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v1.4s\n"
+      "sqadd v17.4s, v17.4s, v31.4s\n"
+      "sqadd v18.4s, v18.4s, v30.4s\n"
+      "sqadd v19.4s, v19.4s, v29.4s\n"
+      "sqadd v20.4s, v20.4s, v28.4s\n"
+      "and v3.16b, v21.16b, v0.16b\n"
+      "and v2.16b, v22.16b, v0.16b\n"
+      "and v1.16b, v23.16b, v0.16b\n"
+      "and v31.16b, v24.16b, v0.16b\n"
+      "and v30.16b, v25.16b, v0.16b\n"
+      "and v29.16b, v26.16b, v0.16b\n"
+      "and v28.16b, v27.16b, v0.16b\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v3.4s\n"
+      "sqadd v22.4s, v22.4s, v2.4s\n"
+      "sqadd v23.4s, v23.4s, v1.4s\n"
+      "sqadd v24.4s, v24.4s, v31.4s\n"
+      "sqadd v25.4s, v25.4s, v30.4s\n"
+      "sqadd v26.4s, v26.4s, v29.4s\n"
+      "sqadd v27.4s, v27.4s, v28.4s\n"
       "80:"  // Height 3: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v30.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1204,132 +1203,132 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v30.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v30.4s\n"
+      "add v19.4s, v19.4s, v30.4s\n"
+      "add v20.4s, v20.4s, v30.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v30.4s\n"
+      "add v23.4s, v23.4s, v30.4s\n"
+      "add v24.4s, v24.4s, v30.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v30.4s\n"
+      "add v27.4s, v27.4s, v30.4s\n"
+      "smin v16.4s, v16.4s, v29.4s\n"
+      "smin v17.4s, v17.4s, v29.4s\n"
+      "smin v18.4s, v18.4s, v29.4s\n"
+      "smin v19.4s, v19.4s, v29.4s\n"
+      "smin v20.4s, v20.4s, v29.4s\n"
+      "smin v21.4s, v21.4s, v29.4s\n"
+      "smin v22.4s, v22.4s, v29.4s\n"
+      "smin v23.4s, v23.4s, v29.4s\n"
+      "smin v24.4s, v24.4s, v29.4s\n"
+      "smin v25.4s, v25.4s, v29.4s\n"
+      "smin v26.4s, v26.4s, v29.4s\n"
+      "smin v27.4s, v27.4s, v29.4s\n"
+      "smax v16.4s, v16.4s, v28.4s\n"
+      "smax v17.4s, v17.4s, v28.4s\n"
+      "smax v18.4s, v18.4s, v28.4s\n"
+      "smax v19.4s, v19.4s, v28.4s\n"
+      "smax v20.4s, v20.4s, v28.4s\n"
+      "smax v21.4s, v21.4s, v28.4s\n"
+      "smax v22.4s, v22.4s, v28.4s\n"
+      "smax v23.4s, v23.4s, v28.4s\n"
+      "smax v24.4s, v24.4s, v28.4s\n"
+      "smax v25.4s, v25.4s, v28.4s\n"
+      "smax v26.4s, v26.4s, v28.4s\n"
+      "smax v27.4s, v27.4s, v28.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v18.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v20.16b, v20.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 89f\n"
       "tbz x9, #3, 84f\n"
       "str d16, [x27], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x9, #2, 82f\n"
       "st1 { v16.s }[2], [x27], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x9, #1, 81f\n"
       "st1 { v16.h }[6], [x27], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[14], [x27]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 88f\n"
       "81:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[12], [x27]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 88f\n"
       "82:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x9, #1, 83f\n"
       "st1 { v16.h }[4], [x27], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[10], [x27]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 88f\n"
       "83:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[8], [x27]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 88f\n"
       "84:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x9, #2, 86f\n"
       "str s16, [x27], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x9, #1, 85f\n"
       "st1 { v16.h }[2], [x27], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[6], [x27]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 88f\n"
       "85:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[4], [x27]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 88f\n"
       "86:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x9, #1, 87f\n"
       "str h16, [x27], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[2], [x27]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 88f\n"
       "87:"  // Height 3: Partial direct writeback: partial_1_0
       "str b16, [x27, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "88:"  // Height 3: Partial direct writeback: Done
       "b 90f\n"
       "89:"  // Height 3: Full writeback
       "str q16, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "90:"  // Height 3: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 62b\n"
@@ -1370,14 +1369,14 @@
       "94:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 95f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 96f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1387,9 +1386,9 @@
       "b 96f\n"
       "95:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "96:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "blt 101f\n"
@@ -1614,29 +1613,29 @@
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
       "103:"  // Height 4: Multiply loop: unique 15: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q7, [x28, #0x0]\n"
+      "ldr q6, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d8  // sdot v24.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0dc  // sdot v28.4s, v6.16b, v3.4b[0]\n"
+      "ldr q5, [x28, #0x20]\n"
+      "ldr q4, [x28, #0x30]\n"
+      ".inst 0x4f80e0f0  // sdot v16.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0f4  // sdot v20.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f8  // sdot v24.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0fc  // sdot v28.4s, v7.16b, v3.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f9  // sdot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0fd  // sdot v29.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x4f82e11a  // sdot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x4f83e11e  // sdot v30.4s, v8.16b, v3.4b[0]\n"
-      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x4f82e13b  // sdot v27.4s, v9.16b, v2.4b[0]\n"
-      ".inst 0x4f83e13f  // sdot v31.4s, v9.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d9  // sdot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0dd  // sdot v29.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0ba  // sdot v26.4s, v5.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0be  // sdot v30.4s, v5.16b, v3.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x4f82e09b  // sdot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x4f83e09f  // sdot v31.4s, v4.16b, v3.4b[0]\n"
       "bge 102b\n"
       "104:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x25, 108f\n"
@@ -1663,73 +1662,73 @@
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
       "107:"  // Height 4: Multiply loop: unique 16: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4f80e150  // sdot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x4f81e154  // sdot v20.4s, v10.16b, v1.4b[0]\n"
+      "ldr q7, [x28, #0x0]\n"
+      "ldr q6, [x28, #0x10]\n"
+      ".inst 0x4f80e0f0  // sdot v16.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0f4  // sdot v20.4s, v7.16b, v1.4b[0]\n"
       "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4f82e158  // sdot v24.4s, v10.16b, v2.4b[0]\n"
-      ".inst 0x4f83e15c  // sdot v28.4s, v10.16b, v3.4b[0]\n"
-      ".inst 0x4f80e091  // sdot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x4f81e095  // sdot v21.4s, v4.16b, v1.4b[0]\n"
+      "ldr q4, [x28, #0x30]\n"
+      ".inst 0x4f82e0f8  // sdot v24.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0fc  // sdot v28.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x4f82e099  // sdot v25.4s, v4.16b, v2.4b[0]\n"
-      ".inst 0x4f83e09d  // sdot v29.4s, v4.16b, v3.4b[0]\n"
+      ".inst 0x4f82e0d9  // sdot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0dd  // sdot v29.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x4f80e0b2  // sdot v18.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x4f81e0b6  // sdot v22.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x4f82e0ba  // sdot v26.4s, v5.16b, v2.4b[0]\n"
       ".inst 0x4f83e0be  // sdot v30.4s, v5.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0d3  // sdot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0d7  // sdot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0db  // sdot v27.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0df  // sdot v31.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x4f82e09b  // sdot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x4f83e09f  // sdot v31.4s, v4.16b, v3.4b[0]\n"
       "108:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 94b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "add x20, x21, x20\n"
+      "add x21, x22, x20\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
       "prfm pstl1keep, [x21, #0x0]\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
       "tbnz %x[flags], #31, 109f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "neg v4.4s, v4.4s\n"
+      "neg v0.4s, v0.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "mul v11.4s, v11.4s, v4.4s\n"
-      "mul v12.4s, v12.4s, v4.4s\n"
-      "mul v13.4s, v13.4s, v4.4s\n"
-      "mul v14.4s, v14.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v0.4s\n"
+      "mul v12.4s, v12.4s, v0.4s\n"
+      "mul v13.4s, v13.4s, v0.4s\n"
+      "mul v14.4s, v14.4s, v0.4s\n"
       "109:"  // Height 4: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q4, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q3, [x10, #0x20]\n"
+      "ldr q2, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v24.4s, v24.4s, v13.4s\n"
       "add v25.4s, v25.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
@@ -1740,100 +1739,100 @@
       "add v30.4s, v30.4s, v14.4s\n"
       "add v31.4s, v31.4s, v14.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v2.4s\n"
       "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "add v22.4s, v22.4s, v3.4s\n"
+      "add v23.4s, v23.4s, v2.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v2.4s\n"
       "add v28.4s, v28.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v29.4s, v29.4s, v1.4s\n"
-      "add v30.4s, v30.4s, v2.4s\n"
-      "add v31.4s, v31.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
-      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
-      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
-      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v29.4s, v29.4s, v4.4s\n"
+      "add v30.4s, v30.4s, v3.4s\n"
+      "add v31.4s, v31.4s, v2.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
       "tbz %x[flags], #5, 110f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v2.16b, v16.16b, v0.16b\n"
+      "and v1.16b, v17.16b, v0.16b\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v2.4s\n"
+      "sqadd v17.4s, v17.4s, v1.4s\n"
+      "and v7.16b, v18.16b, v0.16b\n"
+      "and v6.16b, v19.16b, v0.16b\n"
+      "and v5.16b, v20.16b, v0.16b\n"
+      "and v4.16b, v21.16b, v0.16b\n"
+      "and v3.16b, v22.16b, v0.16b\n"
+      "and v2.16b, v23.16b, v0.16b\n"
+      "and v1.16b, v24.16b, v0.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "and v9.16b, v28.16b, v0.16b\n"
-      "and v10.16b, v29.16b, v0.16b\n"
-      "and v4.16b, v30.16b, v0.16b\n"
-      "and v5.16b, v31.16b, v0.16b\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
-      "sqadd v28.4s, v28.4s, v9.4s\n"
-      "sqadd v29.4s, v29.4s, v10.4s\n"
-      "sqadd v30.4s, v30.4s, v4.4s\n"
-      "sqadd v31.4s, v31.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v7.4s\n"
+      "sqadd v19.4s, v19.4s, v6.4s\n"
+      "sqadd v20.4s, v20.4s, v5.4s\n"
+      "sqadd v21.4s, v21.4s, v4.4s\n"
+      "sqadd v22.4s, v22.4s, v3.4s\n"
+      "sqadd v23.4s, v23.4s, v2.4s\n"
+      "sqadd v24.4s, v24.4s, v1.4s\n"
+      "and v7.16b, v25.16b, v0.16b\n"
+      "and v6.16b, v26.16b, v0.16b\n"
+      "and v5.16b, v27.16b, v0.16b\n"
+      "and v4.16b, v28.16b, v0.16b\n"
+      "and v3.16b, v29.16b, v0.16b\n"
+      "and v2.16b, v30.16b, v0.16b\n"
+      "and v1.16b, v31.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v7.4s\n"
+      "sqadd v26.4s, v26.4s, v6.4s\n"
+      "sqadd v27.4s, v27.4s, v5.4s\n"
+      "sqadd v28.4s, v28.4s, v4.4s\n"
+      "sqadd v29.4s, v29.4s, v3.4s\n"
+      "sqadd v30.4s, v30.4s, v2.4s\n"
+      "sqadd v31.4s, v31.4s, v1.4s\n"
       "110:"  // Height 4: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v3.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v2.4s }, [x20]\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1845,163 +1844,163 @@
       "srshl v29.4s, v29.4s, v0.4s\n"
       "srshl v30.4s, v30.4s, v0.4s\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v3.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v3.4s\n"
+      "add v22.4s, v22.4s, v3.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v3.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v3.4s\n"
+      "add v30.4s, v30.4s, v3.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "smin v16.4s, v16.4s, v2.4s\n"
+      "smin v17.4s, v17.4s, v2.4s\n"
+      "smin v18.4s, v18.4s, v2.4s\n"
+      "smin v19.4s, v19.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v2.4s\n"
+      "smin v21.4s, v21.4s, v2.4s\n"
+      "smin v22.4s, v22.4s, v2.4s\n"
+      "smin v23.4s, v23.4s, v2.4s\n"
+      "smin v24.4s, v24.4s, v2.4s\n"
+      "smin v25.4s, v25.4s, v2.4s\n"
+      "smin v26.4s, v26.4s, v2.4s\n"
+      "smin v27.4s, v27.4s, v2.4s\n"
+      "smin v28.4s, v28.4s, v2.4s\n"
+      "smin v29.4s, v29.4s, v2.4s\n"
+      "smin v30.4s, v30.4s, v2.4s\n"
+      "smin v31.4s, v31.4s, v2.4s\n"
+      "smax v16.4s, v16.4s, v1.4s\n"
+      "smax v17.4s, v17.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v1.4s\n"
+      "smax v19.4s, v19.4s, v1.4s\n"
+      "smax v20.4s, v20.4s, v1.4s\n"
+      "smax v21.4s, v21.4s, v1.4s\n"
+      "smax v22.4s, v22.4s, v1.4s\n"
+      "smax v23.4s, v23.4s, v1.4s\n"
+      "smax v24.4s, v24.4s, v1.4s\n"
+      "smax v25.4s, v25.4s, v1.4s\n"
+      "smax v26.4s, v26.4s, v1.4s\n"
+      "smax v27.4s, v27.4s, v1.4s\n"
+      "smax v28.4s, v28.4s, v1.4s\n"
+      "smax v29.4s, v29.4s, v1.4s\n"
+      "smax v30.4s, v30.4s, v1.4s\n"
+      "smax v31.4s, v31.4s, v1.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v0.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v19.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v18.8h, v26.8h, v27.8h\n"
       "uzp1 v28.8h, v28.8h, v29.8h\n"
-      "uzp1 v29.8h, v30.8h, v31.8h\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
-      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "uzp1 v17.8h, v30.8h, v31.8h\n"
+      "uzp1 v16.16b, v16.16b, v0.16b\n"
+      "uzp1 v20.16b, v20.16b, v19.16b\n"
+      "uzp1 v24.16b, v24.16b, v18.16b\n"
+      "uzp1 v28.16b, v28.16b, v17.16b\n"
       "bge 119f\n"
       "tbz x9, #3, 114f\n"
       "str d16, [x27], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "str d28, [x20], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "str d28, [x21], #0x8\n"
       "tbz x9, #2, 112f\n"
       "st1 { v16.s }[2], [x27], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "st1 { v28.s }[2], [x20], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
       "tbz x9, #1, 111f\n"
       "st1 { v16.h }[6], [x27], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "st1 { v28.h }[6], [x20], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[14], [x27]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
-      "st1 { v28.b }[14], [x20]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
+      "st1 { v28.b }[14], [x21]\n"
       "b 118f\n"
       "111:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[12], [x27]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
-      "st1 { v28.b }[12], [x20]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
+      "st1 { v28.b }[12], [x21]\n"
       "b 118f\n"
       "112:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x9, #1, 113f\n"
       "st1 { v16.h }[4], [x27], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "st1 { v28.h }[4], [x20], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[10], [x27]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
-      "st1 { v28.b }[10], [x20]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
+      "st1 { v28.b }[10], [x21]\n"
       "b 118f\n"
       "113:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[8], [x27]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
-      "st1 { v28.b }[8], [x20]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
+      "st1 { v28.b }[8], [x21]\n"
       "b 118f\n"
       "114:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x9, #2, 116f\n"
       "str s16, [x27], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "str s28, [x20], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "str s28, [x21], #0x4\n"
       "tbz x9, #1, 115f\n"
       "st1 { v16.h }[2], [x27], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "st1 { v28.h }[2], [x20], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[6], [x27]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
-      "st1 { v28.b }[6], [x20]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
+      "st1 { v28.b }[6], [x21]\n"
       "b 118f\n"
       "115:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[4], [x27]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
-      "st1 { v28.b }[4], [x20]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
+      "st1 { v28.b }[4], [x21]\n"
       "b 118f\n"
       "116:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x9, #1, 117f\n"
       "str h16, [x27], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "str h28, [x20], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "str h28, [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[2], [x27]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
-      "st1 { v28.b }[2], [x20]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
+      "st1 { v28.b }[2], [x21]\n"
       "b 118f\n"
       "117:"  // Height 4: Partial direct writeback: partial_1_0
       "str b16, [x27, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
-      "str b28, [x20, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
       "118:"  // Height 4: Partial direct writeback: Done
       "b 120f\n"
       "119:"  // Height 4: Full writeback
       "str q16, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
-      "str q28, [x20, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
       "120:"  // Height 4: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 92b\n"
@@ -2017,7 +2016,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "122:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index 69ea87b..55ea68d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -98,5 +98,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
index 69d01a2..883bd5a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
@@ -78,7 +78,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 97f\n"
@@ -106,11 +105,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -131,35 +130,35 @@
       "ldr q4, [x28, #0x60]\n"
       "blt 9f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v0.2d, v1.2d, v27.2d\n"
       ".inst 0x4e85a410  // smmla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
+      "ldr q25, [x28, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v27.2d\n"
       ".inst 0x4e86a414  // smmla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x4e88a415  // smmla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x4e89a412  // smmla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x4e8aa416  // smmla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x4e84a413  // smmla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x4e85a417  // smmla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
-      ".inst 0x4e87a434  // smmla v20.4s, v1.16b, v7.16b\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x4e99a417  // smmla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x4e98a430  // smmla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
+      ".inst 0x4e9ea434  // smmla v20.4s, v1.16b, v30.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e88a431  // smmla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x4e9da431  // smmla v17.4s, v1.16b, v29.16b\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4e89a435  // smmla v21.4s, v1.16b, v9.16b\n"
-      ".inst 0x4e8aa432  // smmla v18.4s, v1.16b, v10.16b\n"
-      ".inst 0x4e84a436  // smmla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x4e86a437  // smmla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9ca435  // smmla v21.4s, v1.16b, v28.16b\n"
+      ".inst 0x4e9ba432  // smmla v18.4s, v1.16b, v27.16b\n"
+      ".inst 0x4e9aa436  // smmla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e99a433  // smmla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e98a437  // smmla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 8f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942b  // sdot v11.4s, v1.16b, v15.16b\n"
@@ -177,36 +176,36 @@
       "prfm pldl1keep, [x24, #0x80]\n"
       "bge 7b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v0.2d, v1.2d, v24.2d\n"
       ".inst 0x4e85a410  // smmla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
+      "ldr q25, [x28, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v24.2d\n"
       ".inst 0x4e86a414  // smmla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x4e88a415  // smmla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x4e89a412  // smmla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x4e8aa416  // smmla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x4e84a413  // smmla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x4e85a417  // smmla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x4e99a417  // smmla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x4e98a430  // smmla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
       "sub x25, x25, #0x10\n"
-      ".inst 0x4e87a434  // smmla v20.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e88a431  // smmla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x4e9ea434  // smmla v20.4s, v1.16b, v30.16b\n"
+      ".inst 0x4e9da431  // smmla v17.4s, v1.16b, v29.16b\n"
       "add x24, x24, #0x10\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4e89a435  // smmla v21.4s, v1.16b, v9.16b\n"
-      ".inst 0x4e8aa432  // smmla v18.4s, v1.16b, v10.16b\n"
-      ".inst 0x4e84a436  // smmla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x4e86a437  // smmla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9ca435  // smmla v21.4s, v1.16b, v28.16b\n"
+      ".inst 0x4e9ba432  // smmla v18.4s, v1.16b, v27.16b\n"
+      ".inst 0x4e9aa436  // smmla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e99a433  // smmla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e98a437  // smmla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 10f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942b  // sdot v11.4s, v1.16b, v15.16b\n"
@@ -217,29 +216,29 @@
       "cmp x25, #0x8\n"
       "blt 14f\n"
       "12:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x24], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "trn1 v0.2d, v25.2d, v24.2d\n"
       "tbnz %x[flags], #31, 13f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "13:"  // Height 1: Multiply loop: unique 3: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x4e88a410  // smmla v16.4s, v0.16b, v8.16b\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      ".inst 0x4e98a410  // smmla v16.4s, v0.16b, v24.16b\n"
       "sub x25, x25, #0x8\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
       "cmp x25, #0x8\n"
-      ".inst 0x4e89a414  // smmla v20.4s, v0.16b, v9.16b\n"
-      "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x4e8aa411  // smmla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e84a415  // smmla v21.4s, v0.16b, v4.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x4e85a412  // smmla v18.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e86a416  // smmla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a413  // smmla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e88a417  // smmla v23.4s, v0.16b, v8.16b\n"
+      ".inst 0x4e9aa414  // smmla v20.4s, v0.16b, v26.16b\n"
+      "ldr q27, [x28, #0x40]\n"
+      "ldr q26, [x28, #0x50]\n"
+      ".inst 0x4e99a411  // smmla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a415  // smmla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x4e9ba412  // smmla v18.4s, v0.16b, v27.16b\n"
+      ".inst 0x4e9aa416  // smmla v22.4s, v0.16b, v26.16b\n"
+      ".inst 0x4e99a413  // smmla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a417  // smmla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "bge 12b\n"
       "14:"  // Height 1: Multiply loop: Skip odd blocks
@@ -264,26 +263,26 @@
       "17:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b1, [x24, #0x0]\n"
       "18:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v0.2d, v1.2d, v24.2d\n"
       "tbnz %x[flags], #31, 19f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "19:"  // Height 1: Multiply loop: unique 4: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4e8aa410  // smmla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e84a414  // smmla v20.4s, v0.16b, v4.16b\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4e85a411  // smmla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e86a415  // smmla v21.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x4e87a412  // smmla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e88a416  // smmla v22.4s, v0.16b, v8.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
-      ".inst 0x4e89a413  // smmla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x4e8aa417  // smmla v23.4s, v0.16b, v10.16b\n"
+      "ldr q25, [x28, #0x0]\n"
+      "ldr q24, [x28, #0x10]\n"
+      ".inst 0x4e99a410  // smmla v16.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a414  // smmla v20.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x4e99a411  // smmla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a415  // smmla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x40]\n"
+      "ldr q24, [x28, #0x50]\n"
+      ".inst 0x4e99a412  // smmla v18.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a416  // smmla v22.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x4e99a413  // smmla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a417  // smmla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "20:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -297,75 +296,75 @@
       "uzp1 v19.2d, v19.2d, v23.2d\n"
       "mov v23.16b, v16.16b\n"
       "tbnz %x[flags], #31, 21f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v1.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v16.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "neg v1.4s, v1.4s\n"
+      "neg v16.4s, v16.4s\n"
       "dup v11.4s, v11.s[0]\n"
-      "mul v11.4s, v11.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v16.4s\n"
       "21:"  // Height 1: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q24, [x10, #0x0]\n"
+      "ldr q22, [x10, #0x10]\n"
       "add v23.4s, v23.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v16.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v16.4s\n"
       "add x10, x10, #0x40\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v16.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v16.4s\n"
       "tbz %x[flags], #5, 22f\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v22.16b, v23.16b, v0.16b\n"
+      "and v21.16b, v17.16b, v0.16b\n"
+      "and v20.16b, v18.16b, v0.16b\n"
+      "and v16.16b, v19.16b, v0.16b\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v22.4s\n"
+      "sqadd v17.4s, v17.4s, v21.4s\n"
+      "sqadd v18.4s, v18.4s, v20.4s\n"
+      "sqadd v19.4s, v19.4s, v16.4s\n"
       "22:"  // Height 1: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v23.4s, v23.4s, v21.4s\n"
+      "add v17.4s, v17.4s, v21.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add v19.4s, v19.4s, v21.4s\n"
       "cmp x9, #0x10\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "smin v23.4s, v23.4s, v20.4s\n"
+      "smin v17.4s, v17.4s, v20.4s\n"
+      "smin v18.4s, v18.4s, v20.4s\n"
+      "smin v19.4s, v19.4s, v20.4s\n"
+      "smax v23.4s, v23.4s, v16.4s\n"
+      "smax v17.4s, v17.4s, v16.4s\n"
+      "smax v18.4s, v18.4s, v16.4s\n"
+      "smax v19.4s, v19.4s, v16.4s\n"
       "uzp1 v23.8h, v23.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "uzp1 v23.16b, v23.16b, v17.16b\n"
+      "uzp1 v16.8h, v18.8h, v19.8h\n"
+      "uzp1 v23.16b, v23.16b, v16.16b\n"
       "bge 31f\n"
       "tbz x9, #3, 26f\n"
       "str d23, [x27], #0x8\n"
@@ -442,12 +441,12 @@
       "36:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 37f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 38f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -455,7 +454,7 @@
       "b 38f\n"
       "37:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "38:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "blt 43f\n"
@@ -473,34 +472,34 @@
       "39:"  // Height 2: Multiply loop: Main loop head
       "trn1 v0.2d, v1.2d, v2.2d\n"
       ".inst 0x4e85a410  // smmla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       ".inst 0x4e86a414  // smmla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
       ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x4e88a415  // smmla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x4e89a412  // smmla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x4e8aa416  // smmla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x4e84a413  // smmla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x4e85a417  // smmla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
-      ".inst 0x4e87a434  // smmla v20.4s, v1.16b, v7.16b\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x4e99a417  // smmla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x4e98a430  // smmla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
+      ".inst 0x4e9ea434  // smmla v20.4s, v1.16b, v30.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e88a431  // smmla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x4e9da431  // smmla v17.4s, v1.16b, v29.16b\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x4e89a435  // smmla v21.4s, v1.16b, v9.16b\n"
+      ".inst 0x4e9ca435  // smmla v21.4s, v1.16b, v28.16b\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4e8aa432  // smmla v18.4s, v1.16b, v10.16b\n"
-      ".inst 0x4e84a436  // smmla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x4e86a437  // smmla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9ba432  // smmla v18.4s, v1.16b, v27.16b\n"
+      ".inst 0x4e9aa436  // smmla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e99a433  // smmla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e98a437  // smmla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 40f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942b  // sdot v11.4s, v1.16b, v15.16b\n"
@@ -522,35 +521,35 @@
       "41:"  // Height 2: Multiply loop: Single iteration only
       "trn1 v0.2d, v1.2d, v2.2d\n"
       ".inst 0x4e85a410  // smmla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       ".inst 0x4e86a414  // smmla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
       ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x4e88a415  // smmla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x4e89a412  // smmla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x4e8aa416  // smmla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x4e84a413  // smmla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x4e85a417  // smmla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x4e99a417  // smmla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x4e98a430  // smmla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
       "sub x25, x25, #0x10\n"
-      ".inst 0x4e87a434  // smmla v20.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e88a431  // smmla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x4e9ea434  // smmla v20.4s, v1.16b, v30.16b\n"
+      ".inst 0x4e9da431  // smmla v17.4s, v1.16b, v29.16b\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x4e89a435  // smmla v21.4s, v1.16b, v9.16b\n"
-      ".inst 0x4e8aa432  // smmla v18.4s, v1.16b, v10.16b\n"
+      ".inst 0x4e9ca435  // smmla v21.4s, v1.16b, v28.16b\n"
+      ".inst 0x4e9ba432  // smmla v18.4s, v1.16b, v27.16b\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x4e84a436  // smmla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x4e86a437  // smmla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa436  // smmla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e99a433  // smmla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e98a437  // smmla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 42f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f942b  // sdot v11.4s, v1.16b, v15.16b\n"
@@ -562,30 +561,30 @@
       "cmp x25, #0x8\n"
       "blt 46f\n"
       "44:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x24], #0x8\n"
-      "ldr d2, [x23], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "trn1 v0.2d, v25.2d, v24.2d\n"
       "tbnz %x[flags], #31, 45f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "45:"  // Height 2: Multiply loop: unique 7: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x4e88a410  // smmla v16.4s, v0.16b, v8.16b\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      ".inst 0x4e98a410  // smmla v16.4s, v0.16b, v24.16b\n"
       "sub x25, x25, #0x8\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
       "cmp x25, #0x8\n"
-      ".inst 0x4e89a414  // smmla v20.4s, v0.16b, v9.16b\n"
-      "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x4e8aa411  // smmla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e84a415  // smmla v21.4s, v0.16b, v4.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x4e85a412  // smmla v18.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e86a416  // smmla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a413  // smmla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e88a417  // smmla v23.4s, v0.16b, v8.16b\n"
+      ".inst 0x4e9aa414  // smmla v20.4s, v0.16b, v26.16b\n"
+      "ldr q27, [x28, #0x40]\n"
+      "ldr q26, [x28, #0x50]\n"
+      ".inst 0x4e99a411  // smmla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a415  // smmla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x4e9ba412  // smmla v18.4s, v0.16b, v27.16b\n"
+      ".inst 0x4e9aa416  // smmla v22.4s, v0.16b, v26.16b\n"
+      ".inst 0x4e99a413  // smmla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a417  // smmla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "bge 44b\n"
       "46:"  // Height 2: Multiply loop: Skip odd blocks
@@ -621,22 +620,22 @@
       "tbnz %x[flags], #31, 51f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       "51:"  // Height 2: Multiply loop: unique 8: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4e8aa410  // smmla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e84a414  // smmla v20.4s, v0.16b, v4.16b\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4e85a411  // smmla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e86a415  // smmla v21.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x4e87a412  // smmla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e88a416  // smmla v22.4s, v0.16b, v8.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
-      ".inst 0x4e89a413  // smmla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x4e8aa417  // smmla v23.4s, v0.16b, v10.16b\n"
+      "ldr q25, [x28, #0x0]\n"
+      "ldr q24, [x28, #0x10]\n"
+      ".inst 0x4e99a410  // smmla v16.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a414  // smmla v20.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x4e99a411  // smmla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a415  // smmla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x40]\n"
+      "ldr q24, [x28, #0x50]\n"
+      ".inst 0x4e99a412  // smmla v18.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a416  // smmla v22.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x4e99a413  // smmla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x4e98a417  // smmla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "52:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -644,127 +643,127 @@
       "cmp x26, x20\n"
       "bne 36b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 v4.2d, v16.2d, v20.2d\n"
-      "add x22, x27, x20\n"
+      "uzp1 v24.2d, v16.2d, v20.2d\n"
+      "add x23, x27, x20\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
-      "mov v23.16b, v4.16b\n"
+      "mov v23.16b, v24.16b\n"
       "tbnz %x[flags], #31, 53f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v2.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "neg v2.4s, v2.4s\n"
+      "neg v24.4s, v24.4s\n"
       "dup v12.4s, v11.s[3]\n"
       "dup v11.4s, v11.s[0]\n"
-      "mul v11.4s, v11.4s, v2.4s\n"
-      "mul v12.4s, v12.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v24.4s\n"
+      "mul v12.4s, v12.4s, v24.4s\n"
       "53:"  // Height 2: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q27, [x10, #0x10]\n"
       "add v23.4s, v23.4s, v11.4s\n"
       "add v20.4s, v20.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q26, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x30]\n"
       "add v21.4s, v21.4s, v11.4s\n"
       "add v22.4s, v22.4s, v11.4s\n"
       "add v16.4s, v16.4s, v12.4s\n"
       "add v17.4s, v17.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "add v18.4s, v18.4s, v12.4s\n"
       "add v19.4s, v19.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
       "add x10, x10, #0x40\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v25.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v17.4s, v17.4s, v27.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v25.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v24.4s\n"
       "tbz %x[flags], #5, 54f\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "and v5.16b, v20.16b, v0.16b\n"
-      "and v6.16b, v21.16b, v0.16b\n"
-      "and v7.16b, v22.16b, v0.16b\n"
-      "and v8.16b, v16.16b, v0.16b\n"
-      "and v9.16b, v17.16b, v0.16b\n"
-      "and v10.16b, v18.16b, v0.16b\n"
-      "and v4.16b, v19.16b, v0.16b\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "sqadd v16.4s, v16.4s, v8.4s\n"
-      "sqadd v17.4s, v17.4s, v9.4s\n"
-      "sqadd v18.4s, v18.4s, v10.4s\n"
-      "sqadd v19.4s, v19.4s, v4.4s\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
+      "and v30.16b, v20.16b, v0.16b\n"
+      "and v29.16b, v21.16b, v0.16b\n"
+      "and v28.16b, v22.16b, v0.16b\n"
+      "and v27.16b, v16.16b, v0.16b\n"
+      "and v26.16b, v17.16b, v0.16b\n"
+      "and v25.16b, v18.16b, v0.16b\n"
+      "and v24.16b, v19.16b, v0.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v30.4s\n"
+      "sqadd v21.4s, v21.4s, v29.4s\n"
+      "sqadd v22.4s, v22.4s, v28.4s\n"
+      "sqadd v16.4s, v16.4s, v27.4s\n"
+      "sqadd v17.4s, v17.4s, v26.4s\n"
+      "sqadd v18.4s, v18.4s, v25.4s\n"
+      "sqadd v19.4s, v19.4s, v24.4s\n"
       "54:"  // Height 2: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
       "cmp x9, #0x10\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v23.4s, v23.4s, v26.4s\n"
+      "add v20.4s, v20.4s, v26.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v16.4s, v16.4s, v26.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v26.4s\n"
+      "smin v23.4s, v23.4s, v25.4s\n"
+      "smin v20.4s, v20.4s, v25.4s\n"
+      "smin v21.4s, v21.4s, v25.4s\n"
+      "smin v22.4s, v22.4s, v25.4s\n"
+      "smin v16.4s, v16.4s, v25.4s\n"
+      "smin v17.4s, v17.4s, v25.4s\n"
+      "smin v18.4s, v18.4s, v25.4s\n"
+      "smin v19.4s, v19.4s, v25.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
       "uzp1 v23.8h, v23.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
@@ -774,68 +773,68 @@
       "bge 63f\n"
       "tbz x9, #3, 58f\n"
       "str d23, [x27], #0x8\n"
-      "str d16, [x22], #0x8\n"
+      "str d16, [x23], #0x8\n"
       "tbz x9, #2, 56f\n"
       "st1 { v23.s }[2], [x27], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
       "tbz x9, #1, 55f\n"
       "st1 { v23.h }[6], [x27], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[14], [x27]\n"
-      "st1 { v16.b }[14], [x22]\n"
+      "st1 { v16.b }[14], [x23]\n"
       "b 62f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[12], [x27]\n"
-      "st1 { v16.b }[12], [x22]\n"
+      "st1 { v16.b }[12], [x23]\n"
       "b 62f\n"
       "56:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x9, #1, 57f\n"
       "st1 { v23.h }[4], [x27], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[10], [x27]\n"
-      "st1 { v16.b }[10], [x22]\n"
+      "st1 { v16.b }[10], [x23]\n"
       "b 62f\n"
       "57:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[8], [x27]\n"
-      "st1 { v16.b }[8], [x22]\n"
+      "st1 { v16.b }[8], [x23]\n"
       "b 62f\n"
       "58:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x9, #2, 60f\n"
       "str s23, [x27], #0x4\n"
-      "str s16, [x22], #0x4\n"
+      "str s16, [x23], #0x4\n"
       "tbz x9, #1, 59f\n"
       "st1 { v23.h }[2], [x27], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[6], [x27]\n"
-      "st1 { v16.b }[6], [x22]\n"
+      "st1 { v16.b }[6], [x23]\n"
       "b 62f\n"
       "59:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[4], [x27]\n"
-      "st1 { v16.b }[4], [x22]\n"
+      "st1 { v16.b }[4], [x23]\n"
       "b 62f\n"
       "60:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x9, #1, 61f\n"
       "str h23, [x27], #0x2\n"
-      "str h16, [x22], #0x2\n"
+      "str h16, [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[2], [x27]\n"
-      "st1 { v16.b }[2], [x22]\n"
+      "st1 { v16.b }[2], [x23]\n"
       "b 62f\n"
       "61:"  // Height 2: Partial direct writeback: partial_1_0
       "str b23, [x27, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
       "62:"  // Height 2: Partial direct writeback: Done
       "b 64f\n"
       "63:"  // Height 2: Full writeback
       "str q23, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q16, [x22, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
       "64:"  // Height 2: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 34b\n"
@@ -872,13 +871,13 @@
       "68:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 69f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 70f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -887,8 +886,8 @@
       "b 70f\n"
       "69:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "70:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "blt 75f\n"
@@ -909,12 +908,12 @@
       ".inst 0x4e85a410  // smmla v16.4s, v0.16b, v5.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q14, [x28, #0x70]\n"
       ".inst 0x4e86a414  // smmla v20.4s, v0.16b, v6.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      "ldr q4, [x28, #0x60]\n"
+      "ldr q5, [x28, #0x60]\n"
       ".inst 0x4e86a45c  // smmla v28.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q4, [x28, #0x80]\n"
       ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
       ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
       "ldr q7, [x28, #0x90]\n"
@@ -930,15 +929,15 @@
       ".inst 0x4e8aa416  // smmla v22.4s, v0.16b, v10.16b\n"
       ".inst 0x4e8aa45e  // smmla v30.4s, v2.16b, v10.16b\n"
       "ldr q10, [x28, #0xc0]\n"
-      ".inst 0x4e84a413  // smmla v19.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e84a45b  // smmla v27.4s, v2.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x4e85a417  // smmla v23.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e85a45f  // smmla v31.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e85a413  // smmla v19.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e85a45b  // smmla v27.4s, v2.16b, v5.16b\n"
+      "ldr q6, [x28, #0xd0]\n"
+      ".inst 0x4e8ea417  // smmla v23.4s, v0.16b, v14.16b\n"
+      ".inst 0x4e8ea45f  // smmla v31.4s, v2.16b, v14.16b\n"
       "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a478  // smmla v24.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      ".inst 0x4e84a430  // smmla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e84a478  // smmla v24.4s, v3.16b, v4.16b\n"
+      "ldr q4, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
       ".inst 0x4e87a434  // smmla v20.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a47c  // smmla v28.4s, v3.16b, v7.16b\n"
@@ -948,12 +947,12 @@
       ".inst 0x4e89a47d  // smmla v29.4s, v3.16b, v9.16b\n"
       ".inst 0x4e8aa432  // smmla v18.4s, v1.16b, v10.16b\n"
       ".inst 0x4e8aa47a  // smmla v26.4s, v3.16b, v10.16b\n"
-      ".inst 0x4e84a436  // smmla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e84a47e  // smmla v30.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e86a436  // smmla v22.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a47e  // smmla v30.4s, v3.16b, v6.16b\n"
       ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
       ".inst 0x4e85a47b  // smmla v27.4s, v3.16b, v5.16b\n"
-      ".inst 0x4e86a437  // smmla v23.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a47f  // smmla v31.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e84a437  // smmla v23.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e84a47f  // smmla v31.4s, v3.16b, v4.16b\n"
       "tbnz %x[flags], #31, 72f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
@@ -981,12 +980,12 @@
       ".inst 0x4e85a410  // smmla v16.4s, v0.16b, v5.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q14, [x28, #0x70]\n"
       ".inst 0x4e86a414  // smmla v20.4s, v0.16b, v6.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      "ldr q4, [x28, #0x60]\n"
+      "ldr q5, [x28, #0x60]\n"
       ".inst 0x4e86a45c  // smmla v28.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q4, [x28, #0x80]\n"
       ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
       ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
       "ldr q7, [x28, #0x90]\n"
@@ -1003,15 +1002,15 @@
       ".inst 0x4e8aa45e  // smmla v30.4s, v2.16b, v10.16b\n"
       "ldr q10, [x28, #0xc0]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4e84a413  // smmla v19.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e84a45b  // smmla v27.4s, v2.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x4e85a417  // smmla v23.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e85a45f  // smmla v31.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e85a413  // smmla v19.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e85a45b  // smmla v27.4s, v2.16b, v5.16b\n"
+      "ldr q6, [x28, #0xd0]\n"
+      ".inst 0x4e8ea417  // smmla v23.4s, v0.16b, v14.16b\n"
+      ".inst 0x4e8ea45f  // smmla v31.4s, v2.16b, v14.16b\n"
       "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a478  // smmla v24.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      ".inst 0x4e84a430  // smmla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e84a478  // smmla v24.4s, v3.16b, v4.16b\n"
+      "ldr q4, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
       ".inst 0x4e87a434  // smmla v20.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a47c  // smmla v28.4s, v3.16b, v7.16b\n"
@@ -1021,12 +1020,12 @@
       ".inst 0x4e89a47d  // smmla v29.4s, v3.16b, v9.16b\n"
       ".inst 0x4e8aa432  // smmla v18.4s, v1.16b, v10.16b\n"
       ".inst 0x4e8aa47a  // smmla v26.4s, v3.16b, v10.16b\n"
-      ".inst 0x4e84a436  // smmla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e84a47e  // smmla v30.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e86a436  // smmla v22.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a47e  // smmla v30.4s, v3.16b, v6.16b\n"
       ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
       ".inst 0x4e85a47b  // smmla v27.4s, v3.16b, v5.16b\n"
-      ".inst 0x4e86a437  // smmla v23.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a47f  // smmla v31.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e84a437  // smmla v23.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e84a47f  // smmla v31.4s, v3.16b, v4.16b\n"
       "tbnz %x[flags], #31, 74f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
@@ -1042,41 +1041,41 @@
       "blt 78f\n"
       "76:"  // Height 3: Multiply loop: Odd block loop
       "ldr d1, [x24], #0x8\n"
-      "ldr d2, [x23], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x22], #0x8\n"
-      "trn1 v2.2d, v3.2d, v7.2d\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v0.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x22], #0x8\n"
+      "trn1 v2.2d, v1.2d, v2.2d\n"
       "tbnz %x[flags], #31, 77f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "77:"  // Height 3: Multiply loop: unique 11: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x4e88a410  // smmla v16.4s, v0.16b, v8.16b\n"
-      ".inst 0x4e88a458  // smmla v24.4s, v2.16b, v8.16b\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q1, [x28, #0x10]\n"
+      ".inst 0x4e83a410  // smmla v16.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a458  // smmla v24.4s, v2.16b, v3.16b\n"
+      "ldr q7, [x28, #0x20]\n"
+      "ldr q6, [x28, #0x30]\n"
       "sub x25, x25, #0x8\n"
       "cmp x25, #0x8\n"
       "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x4e89a414  // smmla v20.4s, v0.16b, v9.16b\n"
-      ".inst 0x4e89a45c  // smmla v28.4s, v2.16b, v9.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x4e8aa411  // smmla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e8aa459  // smmla v25.4s, v2.16b, v10.16b\n"
-      ".inst 0x4e84a415  // smmla v21.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e84a45d  // smmla v29.4s, v2.16b, v4.16b\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x4e81a414  // smmla v20.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a45c  // smmla v28.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e86a415  // smmla v21.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e86a45d  // smmla v29.4s, v2.16b, v6.16b\n"
       "add x28, x28, #0x80\n"
       ".inst 0x4e85a412  // smmla v18.4s, v0.16b, v5.16b\n"
       ".inst 0x4e85a45a  // smmla v26.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e86a416  // smmla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a45e  // smmla v30.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a413  // smmla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a45b  // smmla v27.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e88a417  // smmla v23.4s, v0.16b, v8.16b\n"
-      ".inst 0x4e88a45f  // smmla v31.4s, v2.16b, v8.16b\n"
+      ".inst 0x4e84a416  // smmla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a45e  // smmla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e83a413  // smmla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a45b  // smmla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e81a417  // smmla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a45f  // smmla v31.4s, v2.16b, v1.16b\n"
       "bge 76b\n"
       "78:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x25, 84f\n"
@@ -1115,52 +1114,52 @@
       "ldr b3, [x22, #0x0]\n"
       "82:"  // Height 3: Multiply loop: Ragged operand read: Done
       "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v9.2d\n"
+      "trn1 v2.2d, v3.2d, v4.2d\n"
       "tbnz %x[flags], #31, 83f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "83:"  // Height 3: Multiply loop: unique 12: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4e8aa410  // smmla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e8aa458  // smmla v24.4s, v2.16b, v10.16b\n"
-      "ldr q5, [x28, #0x20]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q3, [x28, #0x10]\n"
+      ".inst 0x4e81a410  // smmla v16.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a458  // smmla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x28, #0x20]\n"
       "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4e84a414  // smmla v20.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e84a45c  // smmla v28.4s, v2.16b, v4.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x4e85a411  // smmla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
+      ".inst 0x4e83a414  // smmla v20.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a45c  // smmla v28.4s, v2.16b, v3.16b\n"
+      "ldr q5, [x28, #0x40]\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
       ".inst 0x4e86a415  // smmla v21.4s, v0.16b, v6.16b\n"
       ".inst 0x4e86a45d  // smmla v29.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a412  // smmla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a45a  // smmla v26.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e85a412  // smmla v18.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e85a45a  // smmla v26.4s, v2.16b, v5.16b\n"
       "add x28, x28, #0x80\n"
-      ".inst 0x4e88a416  // smmla v22.4s, v0.16b, v8.16b\n"
-      ".inst 0x4e88a45e  // smmla v30.4s, v2.16b, v8.16b\n"
-      ".inst 0x4e89a413  // smmla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x4e89a45b  // smmla v27.4s, v2.16b, v9.16b\n"
-      ".inst 0x4e8aa417  // smmla v23.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e8aa45f  // smmla v31.4s, v2.16b, v10.16b\n"
+      ".inst 0x4e84a416  // smmla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a45e  // smmla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e83a413  // smmla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a45b  // smmla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e81a417  // smmla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a45f  // smmla v31.4s, v2.16b, v1.16b\n"
       "84:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 68b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 v4.2d, v16.2d, v20.2d\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "uzp1 v0.2d, v16.2d, v20.2d\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
+      "prfm pstl1keep, [x22, #0x0]\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
@@ -1168,116 +1167,116 @@
       "uzp1 v25.2d, v25.2d, v29.2d\n"
       "uzp1 v26.2d, v26.2d, v30.2d\n"
       "uzp1 v27.2d, v27.2d, v31.2d\n"
-      "mov v31.16b, v4.16b\n"
+      "mov v31.16b, v0.16b\n"
       "tbnz %x[flags], #31, 85f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v3.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "neg v3.4s, v3.4s\n"
+      "neg v23.4s, v23.4s\n"
       "dup v12.4s, v11.s[3]\n"
       "dup v11.4s, v11.s[0]\n"
       "dup v13.4s, v13.s[0]\n"
-      "mul v11.4s, v11.4s, v3.4s\n"
-      "mul v12.4s, v12.4s, v3.4s\n"
-      "mul v13.4s, v13.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v23.4s\n"
+      "mul v12.4s, v12.4s, v23.4s\n"
+      "mul v13.4s, v13.4s, v23.4s\n"
       "85:"  // Height 3: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q30, [x10, #0x10]\n"
       "add v31.4s, v31.4s, v11.4s\n"
       "add v20.4s, v20.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add v21.4s, v21.4s, v11.4s\n"
       "add v22.4s, v22.4s, v11.4s\n"
       "add v16.4s, v16.4s, v12.4s\n"
       "add v17.4s, v17.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "add v18.4s, v18.4s, v12.4s\n"
       "add v19.4s, v19.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v24.4s, v24.4s, v13.4s\n"
       "add v25.4s, v25.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
       "add v26.4s, v26.4s, v13.4s\n"
       "add v27.4s, v27.4s, v13.4s\n"
       "add v31.4s, v31.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v30.4s\n"
+      "add v21.4s, v21.4s, v29.4s\n"
+      "add v22.4s, v22.4s, v28.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v23.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v23.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v23.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v23.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v23.4s\n"
       "tbz %x[flags], #5, 86f\n"
-      "and v4.16b, v31.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v0.16b\n"
-      "and v6.16b, v21.16b, v0.16b\n"
-      "and v7.16b, v22.16b, v0.16b\n"
-      "and v8.16b, v16.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v31.4s, v31.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "sqadd v16.4s, v16.4s, v8.4s\n"
-      "and v9.16b, v17.16b, v0.16b\n"
-      "and v10.16b, v18.16b, v0.16b\n"
-      "and v4.16b, v19.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v17.4s, v17.4s, v9.4s\n"
-      "sqadd v18.4s, v18.4s, v10.4s\n"
-      "sqadd v19.4s, v19.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "and v1.16b, v31.16b, v0.16b\n"
+      "and v30.16b, v20.16b, v0.16b\n"
+      "and v29.16b, v21.16b, v0.16b\n"
+      "and v28.16b, v22.16b, v0.16b\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v31.4s, v31.4s, v1.4s\n"
+      "sqadd v20.4s, v20.4s, v30.4s\n"
+      "sqadd v21.4s, v21.4s, v29.4s\n"
+      "sqadd v22.4s, v22.4s, v28.4s\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "and v3.16b, v17.16b, v0.16b\n"
+      "and v2.16b, v18.16b, v0.16b\n"
+      "and v1.16b, v19.16b, v0.16b\n"
+      "and v30.16b, v24.16b, v0.16b\n"
+      "and v29.16b, v25.16b, v0.16b\n"
+      "and v28.16b, v26.16b, v0.16b\n"
+      "and v23.16b, v27.16b, v0.16b\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v3.4s\n"
+      "sqadd v18.4s, v18.4s, v2.4s\n"
+      "sqadd v19.4s, v19.4s, v1.4s\n"
+      "sqadd v24.4s, v24.4s, v30.4s\n"
+      "sqadd v25.4s, v25.4s, v29.4s\n"
+      "sqadd v26.4s, v26.4s, v28.4s\n"
+      "sqadd v27.4s, v27.4s, v23.4s\n"
       "86:"  // Height 3: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1285,132 +1284,132 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v31.4s, v31.4s, v29.4s\n"
+      "add v20.4s, v20.4s, v29.4s\n"
+      "add v21.4s, v21.4s, v29.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v16.4s, v16.4s, v29.4s\n"
+      "add v17.4s, v17.4s, v29.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v29.4s\n"
+      "add v24.4s, v24.4s, v29.4s\n"
+      "add v25.4s, v25.4s, v29.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v29.4s\n"
+      "smin v31.4s, v31.4s, v28.4s\n"
+      "smin v20.4s, v20.4s, v28.4s\n"
+      "smin v21.4s, v21.4s, v28.4s\n"
+      "smin v22.4s, v22.4s, v28.4s\n"
+      "smin v16.4s, v16.4s, v28.4s\n"
+      "smin v17.4s, v17.4s, v28.4s\n"
+      "smin v18.4s, v18.4s, v28.4s\n"
+      "smin v19.4s, v19.4s, v28.4s\n"
+      "smin v24.4s, v24.4s, v28.4s\n"
+      "smin v25.4s, v25.4s, v28.4s\n"
+      "smin v26.4s, v26.4s, v28.4s\n"
+      "smin v27.4s, v27.4s, v28.4s\n"
+      "smax v31.4s, v31.4s, v23.4s\n"
+      "smax v20.4s, v20.4s, v23.4s\n"
+      "smax v21.4s, v21.4s, v23.4s\n"
+      "smax v22.4s, v22.4s, v23.4s\n"
+      "smax v16.4s, v16.4s, v23.4s\n"
+      "smax v17.4s, v17.4s, v23.4s\n"
+      "smax v18.4s, v18.4s, v23.4s\n"
+      "smax v19.4s, v19.4s, v23.4s\n"
+      "smax v24.4s, v24.4s, v23.4s\n"
+      "smax v25.4s, v25.4s, v23.4s\n"
+      "smax v26.4s, v26.4s, v23.4s\n"
+      "smax v27.4s, v27.4s, v23.4s\n"
       "uzp1 v31.8h, v31.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
       "uzp1 v31.16b, v31.16b, v20.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 95f\n"
       "tbz x9, #3, 90f\n"
       "str d31, [x27], #0x8\n"
-      "str d16, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d16, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x9, #2, 88f\n"
       "st1 { v31.s }[2], [x27], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x9, #1, 87f\n"
       "st1 { v31.h }[6], [x27], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[14], [x27]\n"
-      "st1 { v16.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 94f\n"
       "87:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[12], [x27]\n"
-      "st1 { v16.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 94f\n"
       "88:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x9, #1, 89f\n"
       "st1 { v31.h }[4], [x27], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[10], [x27]\n"
-      "st1 { v16.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 94f\n"
       "89:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[8], [x27]\n"
-      "st1 { v16.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 94f\n"
       "90:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x9, #2, 92f\n"
       "str s31, [x27], #0x4\n"
-      "str s16, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s16, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x9, #1, 91f\n"
       "st1 { v31.h }[2], [x27], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[6], [x27]\n"
-      "st1 { v16.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 94f\n"
       "91:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[4], [x27]\n"
-      "st1 { v16.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 94f\n"
       "92:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x9, #1, 93f\n"
       "str h31, [x27], #0x2\n"
-      "str h16, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h16, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[2], [x27]\n"
-      "st1 { v16.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 94f\n"
       "93:"  // Height 3: Partial direct writeback: partial_1_0
       "str b31, [x27, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "94:"  // Height 3: Partial direct writeback: Done
       "b 96f\n"
       "95:"  // Height 3: Full writeback
       "str q31, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q16, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "96:"  // Height 3: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 66b\n"
@@ -1451,14 +1450,14 @@
       "100:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 101f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 102f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1468,9 +1467,9 @@
       "b 102f\n"
       "101:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "102:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "blt 107f\n"
@@ -1630,42 +1629,42 @@
       "blt 110f\n"
       "108:"  // Height 4: Multiply loop: Odd block loop
       "ldr d1, [x24], #0x8\n"
-      "ldr d2, [x23], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v2.2d, v3.2d, v7.2d\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v0.2d, v1.2d, v0.2d\n"
+      "ldr d2, [x22], #0x8\n"
+      "ldr d1, [x21], #0x8\n"
+      "trn1 v2.2d, v2.2d, v1.2d\n"
       "tbnz %x[flags], #31, 109f\n"
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "109:"  // Height 4: Multiply loop: unique 15: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x4e88a410  // smmla v16.4s, v0.16b, v8.16b\n"
-      ".inst 0x4e88a458  // smmla v24.4s, v2.16b, v8.16b\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q1, [x28, #0x10]\n"
+      ".inst 0x4e83a410  // smmla v16.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a458  // smmla v24.4s, v2.16b, v3.16b\n"
+      "ldr q7, [x28, #0x20]\n"
+      "ldr q6, [x28, #0x30]\n"
       "sub x25, x25, #0x8\n"
       "cmp x25, #0x8\n"
       "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x4e89a414  // smmla v20.4s, v0.16b, v9.16b\n"
-      ".inst 0x4e89a45c  // smmla v28.4s, v2.16b, v9.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x4e8aa411  // smmla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e8aa459  // smmla v25.4s, v2.16b, v10.16b\n"
-      ".inst 0x4e84a415  // smmla v21.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e84a45d  // smmla v29.4s, v2.16b, v4.16b\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x4e81a414  // smmla v20.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a45c  // smmla v28.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x4e87a411  // smmla v17.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e86a415  // smmla v21.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e86a45d  // smmla v29.4s, v2.16b, v6.16b\n"
       "add x28, x28, #0x80\n"
       ".inst 0x4e85a412  // smmla v18.4s, v0.16b, v5.16b\n"
       ".inst 0x4e85a45a  // smmla v26.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e86a416  // smmla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a45e  // smmla v30.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a413  // smmla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a45b  // smmla v27.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e88a417  // smmla v23.4s, v0.16b, v8.16b\n"
-      ".inst 0x4e88a45f  // smmla v31.4s, v2.16b, v8.16b\n"
+      ".inst 0x4e84a416  // smmla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a45e  // smmla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e83a413  // smmla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a45b  // smmla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e81a417  // smmla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a45f  // smmla v31.4s, v2.16b, v1.16b\n"
       "bge 108b\n"
       "110:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x25, 116f\n"
@@ -1716,51 +1715,51 @@
       ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
       "115:"  // Height 4: Multiply loop: unique 16: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x4e8aa410  // smmla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e8aa458  // smmla v24.4s, v2.16b, v10.16b\n"
-      "ldr q5, [x28, #0x20]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q3, [x28, #0x10]\n"
+      ".inst 0x4e81a410  // smmla v16.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a458  // smmla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x28, #0x20]\n"
       "ldr q6, [x28, #0x30]\n"
-      ".inst 0x4e84a414  // smmla v20.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e84a45c  // smmla v28.4s, v2.16b, v4.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x4e85a411  // smmla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
+      ".inst 0x4e83a414  // smmla v20.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a45c  // smmla v28.4s, v2.16b, v3.16b\n"
+      "ldr q5, [x28, #0x40]\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
       ".inst 0x4e86a415  // smmla v21.4s, v0.16b, v6.16b\n"
       ".inst 0x4e86a45d  // smmla v29.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a412  // smmla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a45a  // smmla v26.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e85a412  // smmla v18.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e85a45a  // smmla v26.4s, v2.16b, v5.16b\n"
       "add x28, x28, #0x80\n"
-      ".inst 0x4e88a416  // smmla v22.4s, v0.16b, v8.16b\n"
-      ".inst 0x4e88a45e  // smmla v30.4s, v2.16b, v8.16b\n"
-      ".inst 0x4e89a413  // smmla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x4e89a45b  // smmla v27.4s, v2.16b, v9.16b\n"
-      ".inst 0x4e8aa417  // smmla v23.4s, v0.16b, v10.16b\n"
-      ".inst 0x4e8aa45f  // smmla v31.4s, v2.16b, v10.16b\n"
+      ".inst 0x4e84a416  // smmla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e84a45e  // smmla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e83a413  // smmla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e83a45b  // smmla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e81a417  // smmla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x4e81a45f  // smmla v31.4s, v2.16b, v1.16b\n"
       "116:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 100b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 v4.2d, v16.2d, v20.2d\n"
-      "add x22, x27, x20\n"
+      "uzp1 v0.2d, v16.2d, v20.2d\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "add x21, x22, x20\n"
-      "add x20, x21, x20\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "prfm pstl1keep, [x27, #0x0]\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "uzp1 v23.2d, v24.2d, v28.2d\n"
       "uzp2 v24.2d, v24.2d, v28.2d\n"
@@ -1770,38 +1769,38 @@
       "uzp2 v26.2d, v26.2d, v30.2d\n"
       "uzp1 v30.2d, v27.2d, v31.2d\n"
       "uzp2 v27.2d, v27.2d, v31.2d\n"
-      "mov v31.16b, v4.16b\n"
+      "mov v31.16b, v0.16b\n"
       "tbnz %x[flags], #31, 117f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "neg v4.4s, v4.4s\n"
+      "neg v0.4s, v0.4s\n"
       "dup v12.4s, v11.s[3]\n"
       "dup v11.4s, v11.s[0]\n"
       "dup v14.4s, v13.s[3]\n"
       "dup v13.4s, v13.s[0]\n"
-      "mul v11.4s, v11.4s, v4.4s\n"
-      "mul v12.4s, v12.4s, v4.4s\n"
-      "mul v13.4s, v13.4s, v4.4s\n"
-      "mul v14.4s, v14.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v0.4s\n"
+      "mul v12.4s, v12.4s, v0.4s\n"
+      "mul v13.4s, v13.4s, v0.4s\n"
+      "mul v14.4s, v14.4s, v0.4s\n"
       "117:"  // Height 4: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q4, [x10, #0x10]\n"
       "add v31.4s, v31.4s, v11.4s\n"
       "add v20.4s, v20.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q3, [x10, #0x20]\n"
+      "ldr q2, [x10, #0x30]\n"
       "add v21.4s, v21.4s, v11.4s\n"
       "add v22.4s, v22.4s, v11.4s\n"
       "add v16.4s, v16.4s, v12.4s\n"
       "add v17.4s, v17.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "add v18.4s, v18.4s, v12.4s\n"
       "add v19.4s, v19.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v23.4s, v23.4s, v13.4s\n"
       "add v28.4s, v28.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
@@ -1812,100 +1811,100 @@
       "add v26.4s, v26.4s, v14.4s\n"
       "add v27.4s, v27.4s, v14.4s\n"
       "add v31.4s, v31.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v3.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v2.4s\n"
       "add v23.4s, v23.4s, v0.4s\n"
-      "add v28.4s, v28.4s, v1.4s\n"
-      "add v29.4s, v29.4s, v2.4s\n"
-      "add v30.4s, v30.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v4.4s\n"
+      "add v29.4s, v29.4s, v3.4s\n"
+      "add v30.4s, v30.4s, v2.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
-      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
-      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v2.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
       "tbz %x[flags], #5, 118f\n"
-      "and v4.16b, v31.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v31.4s, v31.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "and v6.16b, v21.16b, v0.16b\n"
-      "and v7.16b, v22.16b, v0.16b\n"
-      "and v8.16b, v16.16b, v0.16b\n"
-      "and v9.16b, v17.16b, v0.16b\n"
-      "and v10.16b, v18.16b, v0.16b\n"
-      "and v4.16b, v19.16b, v0.16b\n"
-      "and v5.16b, v23.16b, v0.16b\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v2.16b, v31.16b, v0.16b\n"
+      "and v1.16b, v20.16b, v0.16b\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v31.4s, v31.4s, v2.4s\n"
+      "sqadd v20.4s, v20.4s, v1.4s\n"
+      "and v7.16b, v21.16b, v0.16b\n"
+      "and v6.16b, v22.16b, v0.16b\n"
+      "and v5.16b, v16.16b, v0.16b\n"
+      "and v4.16b, v17.16b, v0.16b\n"
+      "and v3.16b, v18.16b, v0.16b\n"
+      "and v2.16b, v19.16b, v0.16b\n"
+      "and v1.16b, v23.16b, v0.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "sqadd v16.4s, v16.4s, v8.4s\n"
-      "sqadd v17.4s, v17.4s, v9.4s\n"
-      "sqadd v18.4s, v18.4s, v10.4s\n"
-      "sqadd v19.4s, v19.4s, v4.4s\n"
-      "sqadd v23.4s, v23.4s, v5.4s\n"
-      "and v6.16b, v28.16b, v0.16b\n"
-      "and v7.16b, v29.16b, v0.16b\n"
-      "and v8.16b, v30.16b, v0.16b\n"
-      "and v9.16b, v24.16b, v0.16b\n"
-      "and v10.16b, v25.16b, v0.16b\n"
-      "and v4.16b, v26.16b, v0.16b\n"
-      "and v5.16b, v27.16b, v0.16b\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v28.4s, v28.4s, v6.4s\n"
-      "sqadd v29.4s, v29.4s, v7.4s\n"
-      "sqadd v30.4s, v30.4s, v8.4s\n"
-      "sqadd v24.4s, v24.4s, v9.4s\n"
-      "sqadd v25.4s, v25.4s, v10.4s\n"
-      "sqadd v26.4s, v26.4s, v4.4s\n"
-      "sqadd v27.4s, v27.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v7.4s\n"
+      "sqadd v22.4s, v22.4s, v6.4s\n"
+      "sqadd v16.4s, v16.4s, v5.4s\n"
+      "sqadd v17.4s, v17.4s, v4.4s\n"
+      "sqadd v18.4s, v18.4s, v3.4s\n"
+      "sqadd v19.4s, v19.4s, v2.4s\n"
+      "sqadd v23.4s, v23.4s, v1.4s\n"
+      "and v7.16b, v28.16b, v0.16b\n"
+      "and v6.16b, v29.16b, v0.16b\n"
+      "and v5.16b, v30.16b, v0.16b\n"
+      "and v4.16b, v24.16b, v0.16b\n"
+      "and v3.16b, v25.16b, v0.16b\n"
+      "and v2.16b, v26.16b, v0.16b\n"
+      "and v1.16b, v27.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v7.4s\n"
+      "sqadd v29.4s, v29.4s, v6.4s\n"
+      "sqadd v30.4s, v30.4s, v5.4s\n"
+      "sqadd v24.4s, v24.4s, v4.4s\n"
+      "sqadd v25.4s, v25.4s, v3.4s\n"
+      "sqadd v26.4s, v26.4s, v2.4s\n"
+      "sqadd v27.4s, v27.4s, v1.4s\n"
       "118:"  // Height 4: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v3.4s }, [x20]\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v2.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1917,163 +1916,163 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v3.4s\n"
+      "add v22.4s, v22.4s, v3.4s\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v3.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v3.4s\n"
+      "add v30.4s, v30.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v3.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "smin v31.4s, v31.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v2.4s\n"
+      "smin v21.4s, v21.4s, v2.4s\n"
+      "smin v22.4s, v22.4s, v2.4s\n"
+      "smin v16.4s, v16.4s, v2.4s\n"
+      "smin v17.4s, v17.4s, v2.4s\n"
+      "smin v18.4s, v18.4s, v2.4s\n"
+      "smin v19.4s, v19.4s, v2.4s\n"
+      "smin v23.4s, v23.4s, v2.4s\n"
+      "smin v28.4s, v28.4s, v2.4s\n"
+      "smin v29.4s, v29.4s, v2.4s\n"
+      "smin v30.4s, v30.4s, v2.4s\n"
+      "smin v24.4s, v24.4s, v2.4s\n"
+      "smin v25.4s, v25.4s, v2.4s\n"
+      "smin v26.4s, v26.4s, v2.4s\n"
+      "smin v27.4s, v27.4s, v2.4s\n"
+      "smax v31.4s, v31.4s, v1.4s\n"
+      "smax v20.4s, v20.4s, v1.4s\n"
+      "smax v21.4s, v21.4s, v1.4s\n"
+      "smax v22.4s, v22.4s, v1.4s\n"
+      "smax v16.4s, v16.4s, v1.4s\n"
+      "smax v17.4s, v17.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v1.4s\n"
+      "smax v19.4s, v19.4s, v1.4s\n"
+      "smax v23.4s, v23.4s, v1.4s\n"
+      "smax v28.4s, v28.4s, v1.4s\n"
+      "smax v29.4s, v29.4s, v1.4s\n"
+      "smax v30.4s, v30.4s, v1.4s\n"
+      "smax v24.4s, v24.4s, v1.4s\n"
+      "smax v25.4s, v25.4s, v1.4s\n"
+      "smax v26.4s, v26.4s, v1.4s\n"
+      "smax v27.4s, v27.4s, v1.4s\n"
       "uzp1 v31.8h, v31.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v23.8h, v23.8h, v28.8h\n"
-      "uzp1 v28.8h, v29.8h, v30.8h\n"
+      "uzp1 v18.8h, v29.8h, v30.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
       "uzp1 v31.16b, v31.16b, v20.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v23.16b, v23.16b, v28.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v23.16b, v23.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 127f\n"
       "tbz x9, #3, 122f\n"
       "str d31, [x27], #0x8\n"
-      "str d16, [x22], #0x8\n"
-      "str d23, [x21], #0x8\n"
-      "str d24, [x20], #0x8\n"
+      "str d16, [x23], #0x8\n"
+      "str d23, [x22], #0x8\n"
+      "str d24, [x21], #0x8\n"
       "tbz x9, #2, 120f\n"
       "st1 { v31.s }[2], [x27], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
-      "st1 { v23.s }[2], [x21], #0x4\n"
-      "st1 { v24.s }[2], [x20], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v23.s }[2], [x22], #0x4\n"
+      "st1 { v24.s }[2], [x21], #0x4\n"
       "tbz x9, #1, 119f\n"
       "st1 { v31.h }[6], [x27], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
-      "st1 { v23.h }[6], [x21], #0x2\n"
-      "st1 { v24.h }[6], [x20], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v23.h }[6], [x22], #0x2\n"
+      "st1 { v24.h }[6], [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[14], [x27]\n"
-      "st1 { v16.b }[14], [x22]\n"
-      "st1 { v23.b }[14], [x21]\n"
-      "st1 { v24.b }[14], [x20]\n"
+      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v23.b }[14], [x22]\n"
+      "st1 { v24.b }[14], [x21]\n"
       "b 126f\n"
       "119:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[12], [x27]\n"
-      "st1 { v16.b }[12], [x22]\n"
-      "st1 { v23.b }[12], [x21]\n"
-      "st1 { v24.b }[12], [x20]\n"
+      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v23.b }[12], [x22]\n"
+      "st1 { v24.b }[12], [x21]\n"
       "b 126f\n"
       "120:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x9, #1, 121f\n"
       "st1 { v31.h }[4], [x27], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
-      "st1 { v23.h }[4], [x21], #0x2\n"
-      "st1 { v24.h }[4], [x20], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v23.h }[4], [x22], #0x2\n"
+      "st1 { v24.h }[4], [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[10], [x27]\n"
-      "st1 { v16.b }[10], [x22]\n"
-      "st1 { v23.b }[10], [x21]\n"
-      "st1 { v24.b }[10], [x20]\n"
+      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v23.b }[10], [x22]\n"
+      "st1 { v24.b }[10], [x21]\n"
       "b 126f\n"
       "121:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[8], [x27]\n"
-      "st1 { v16.b }[8], [x22]\n"
-      "st1 { v23.b }[8], [x21]\n"
-      "st1 { v24.b }[8], [x20]\n"
+      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v23.b }[8], [x22]\n"
+      "st1 { v24.b }[8], [x21]\n"
       "b 126f\n"
       "122:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x9, #2, 124f\n"
       "str s31, [x27], #0x4\n"
-      "str s16, [x22], #0x4\n"
-      "str s23, [x21], #0x4\n"
-      "str s24, [x20], #0x4\n"
+      "str s16, [x23], #0x4\n"
+      "str s23, [x22], #0x4\n"
+      "str s24, [x21], #0x4\n"
       "tbz x9, #1, 123f\n"
       "st1 { v31.h }[2], [x27], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
-      "st1 { v23.h }[2], [x21], #0x2\n"
-      "st1 { v24.h }[2], [x20], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v23.h }[2], [x22], #0x2\n"
+      "st1 { v24.h }[2], [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[6], [x27]\n"
-      "st1 { v16.b }[6], [x22]\n"
-      "st1 { v23.b }[6], [x21]\n"
-      "st1 { v24.b }[6], [x20]\n"
+      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v23.b }[6], [x22]\n"
+      "st1 { v24.b }[6], [x21]\n"
       "b 126f\n"
       "123:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[4], [x27]\n"
-      "st1 { v16.b }[4], [x22]\n"
-      "st1 { v23.b }[4], [x21]\n"
-      "st1 { v24.b }[4], [x20]\n"
+      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v23.b }[4], [x22]\n"
+      "st1 { v24.b }[4], [x21]\n"
       "b 126f\n"
       "124:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x9, #1, 125f\n"
       "str h31, [x27], #0x2\n"
-      "str h16, [x22], #0x2\n"
-      "str h23, [x21], #0x2\n"
-      "str h24, [x20], #0x2\n"
+      "str h16, [x23], #0x2\n"
+      "str h23, [x22], #0x2\n"
+      "str h24, [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[2], [x27]\n"
-      "st1 { v16.b }[2], [x22]\n"
-      "st1 { v23.b }[2], [x21]\n"
-      "st1 { v24.b }[2], [x20]\n"
+      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v23.b }[2], [x22]\n"
+      "st1 { v24.b }[2], [x21]\n"
       "b 126f\n"
       "125:"  // Height 4: Partial direct writeback: partial_1_0
       "str b31, [x27, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
-      "str b23, [x21, #0x0]\n"
-      "str b24, [x20, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
+      "str b23, [x22, #0x0]\n"
+      "str b24, [x21, #0x0]\n"
       "126:"  // Height 4: Partial direct writeback: Done
       "b 128f\n"
       "127:"  // Height 4: Full writeback
       "str q31, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q16, [x22, #0x0]\n"
-      "str q23, [x21, #0x0]\n"
-      "str q24, [x20, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
+      "str q23, [x22, #0x0]\n"
+      "str q24, [x21, #0x0]\n"
       "128:"  // Height 4: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 98b\n"
@@ -2089,7 +2088,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "130:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index b028a8a..2b7531d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -108,5 +108,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
index b97b63c..38a57b0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -85,7 +85,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 136f\n"
@@ -111,11 +110,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w13, [x20, x14, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x12, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x12, [x20, #0x0]\n"
       "cbnz x14, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x12, x12, x20\n"
@@ -132,129 +131,129 @@
       "blt 8f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x15, #0x20]\n"
+      "ldr d17, [x15, #0x20]\n"
       "ldr x20, [x15, #0x28]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x15, #0x30]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x38]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x15, #0x40]\n"
+      "ldr d16, [x15, #0x30]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x38]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr d17, [x15, #0x40]\n"
       "ldr x20, [x15, #0x48]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x15, #0x50]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr d6, [x15, #0x60]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr d16, [x15, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr d17, [x15, #0x60]\n"
       "ldr x20, [x15, #0x68]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x78]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr d6, [x15, #0x80]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr d16, [x15, #0x70]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x78]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr d17, [x15, #0x80]\n"
       "ldr x20, [x15, #0x88]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr d7, [x15, #0x90]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr d6, [x15, #0xa0]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr d16, [x15, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr d17, [x15, #0xa0]\n"
       "ldr x20, [x15, #0xa8]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0xb8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr d6, [x15, #0xc0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr d16, [x15, #0xb0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0xb8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr d17, [x15, #0xc0]\n"
       "ldr x20, [x15, #0xc8]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr d7, [x15, #0xd0]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr d6, [x15, #0xe0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr d16, [x15, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr d17, [x15, #0xe0]\n"
       "ldr x20, [x15, #0xe8]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0xf8]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr d16, [x15, #0xf0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0xf8]\n"
+      "mov v16.d[1], x20\n"
       "add x12, x12, #0x10\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
       "ldr d6, [x15, #0x0]\n"
       "ldr x20, [x15, #0x8]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr d0, [x12, #0x0]\n"
       "sub x13, x13, #0x10\n"
       "ldr d7, [x15, #0x10]\n"
       "cmp x13, #0x20\n"
-      "ldr x10, [x12, #0x8]\n"
+      "ldr x21, [x12, #0x8]\n"
       "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x18]\n"
-      "mov v0.d[1], x10\n"
-      "mov v7.d[1], x11\n"
+      "ldr x20, [x15, #0x18]\n"
+      "mov v0.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "prfm pldl1keep, [x12, #0x80]\n"
       "bge 7b\n"
       "8:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
+      "ldr q17, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x15, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x15, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x15, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x15, #0xf0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x15, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x15, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x15, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x15, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x15, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x15, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x15, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x15, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x15, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x15, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x15, #0xf0]\n"
       "add x12, x12, #0x10\n"
       "sub x13, x13, #0x10\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "add x15, x15, #0x100\n"
       "9:"  // Height 1: Multiply loop: Main loop skip
       "cbz x13, 14f\n"
       "cmp x13, #0x4\n"
       "blt 11f\n"
       "10:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x12], #0x4\n"
+      "ldr s18, [x12], #0x4\n"
       "sub x13, x13, #0x4\n"
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
+      "ldr q16, [x15, #0x0]\n"
+      ".inst 0x4f92e208  // sdot v8.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x15, #0x10]\n"
+      ".inst 0x4f92e209  // sdot v9.4s, v16.16b, v18.4b[0]\n"
+      "ldr q17, [x15, #0x20]\n"
       "cmp x13, #0x4\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      ".inst 0x4f92e22a  // sdot v10.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x4f92e20b  // sdot v11.4s, v16.16b, v18.4b[0]\n"
       "add x15, x15, #0x40\n"
       "bge 10b\n"
       "11:"  // Height 1: Multiply loop: Skip odd blocks
@@ -267,28 +266,28 @@
       "12:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x12, #0x0]\n"
       "13:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x0]\n"
+      ".inst 0x4f80e208  // sdot v8.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x10]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x20]\n"
+      ".inst 0x4f80e20a  // sdot v10.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "add x15, x15, #0x40\n"
       "14:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x14, x14, #0x1\n"
       "cmp x14, x20\n"
       "bne 4b\n"
-      "ldr q0, [x6, #0x0]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "ldr q1, [x6, #0x10]\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x6, #0x20]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "ldr q3, [x6, #0x30]\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q16, [x6, #0x0]\n"
+      "add v8.4s, v8.4s, v16.4s\n"
+      "ldr q16, [x6, #0x10]\n"
+      "add v9.4s, v9.4s, v16.4s\n"
+      "ldr q16, [x6, #0x20]\n"
+      "add v10.4s, v10.4s, v16.4s\n"
+      "ldr q16, [x6, #0x30]\n"
+      "add v11.4s, v11.4s, v16.4s\n"
       "prfm pstl1keep, [x17, #0x0]\n"
       "add x6, x6, #0x40\n"
       "tbz %x[flags], #4, 15f\n"
@@ -304,10 +303,10 @@
       "add x7, x7, #0x40\n"
       "b 16f\n"
       "15:"  // Height 1: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -320,45 +319,45 @@
       "sqrdmulh v10.4s, v10.4s, v6.4s\n"
       "sqrdmulh v11.4s, v11.4s, v7.4s\n"
       "tbz %x[flags], #5, 17f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v19.16b, v8.16b, v0.16b\n"
+      "and v18.16b, v9.16b, v1.16b\n"
+      "and v17.16b, v10.16b, v2.16b\n"
+      "and v16.16b, v11.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v19.4s\n"
+      "sqadd v9.4s, v9.4s, v18.4s\n"
+      "sqadd v10.4s, v10.4s, v17.4s\n"
+      "sqadd v11.4s, v11.4s, v16.4s\n"
       "17:"  // Height 1: no shift correction
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v16.4s\n"
+      "add v9.4s, v9.4s, v16.4s\n"
+      "add v10.4s, v10.4s, v16.4s\n"
+      "add v11.4s, v11.4s, v16.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "smin v8.4s, v8.4s, v16.4s\n"
+      "smin v9.4s, v9.4s, v16.4s\n"
+      "smin v10.4s, v10.4s, v16.4s\n"
+      "smin v11.4s, v11.4s, v16.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "smax v8.4s, v8.4s, v16.4s\n"
+      "smax v9.4s, v9.4s, v16.4s\n"
+      "smax v10.4s, v10.4s, v16.4s\n"
+      "smax v11.4s, v11.4s, v16.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v16.8h, v10.8h, v11.8h\n"
       "cmp x16, #0x10\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v8.16b, v8.16b, v16.16b\n"
       "bge 26f\n"
       "tbz x16, #3, 21f\n"
       "str d8, [x17], #0x8\n"
@@ -433,247 +432,247 @@
       "31:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w13, [x20, x14, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 32f\n"
-      "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x12, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x12, [x20, #0x0]\n"
+      "ldr x11, [x20, #0x8]\n"
       "cbnz x14, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x12, x12, x20\n"
-      "add x9, x9, x20\n"
+      "add x11, x11, x20\n"
       "b 33f\n"
       "32:"  // Height 2: setup direct input
       "mov x12, %x[input_ptr]\n"
-      "add x9, x12, x20\n"
+      "add x11, x12, x21\n"
       "33:"  // Height 2: input setup done
       "cmp x13, #0x10\n"
       "blt 36f\n"
       "ldr q0, [x12, #0x0]\n"
       "cmp x13, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
+      "ldr q1, [x11, #0x0]\n"
       "ldr q6, [x15, #0x0]\n"
       "ldr q7, [x15, #0x10]\n"
       "blt 35f\n"
       "34:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x20, [x15, #0x28]\n"
+      "ldr x21, [x15, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x15, #0x20]\n"
+      "ldr d17, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr x11, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x15, #0x30]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr d16, [x15, #0x30]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr d17, [x15, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "ldr x20, [x15, #0x48]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x15, #0x50]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr x20, [x15, #0x68]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr d6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr x11, [x15, #0x78]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr d6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr d16, [x15, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr x21, [x15, #0x68]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr d17, [x15, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr x20, [x15, #0x78]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr d16, [x15, #0x70]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr d17, [x15, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
       "ldr x20, [x15, #0x88]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr d7, [x15, #0x90]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr x20, [x15, #0xa8]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr x11, [x15, #0xb8]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr d6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr d16, [x15, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr x21, [x15, #0xa8]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr d17, [x15, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr x20, [x15, #0xb8]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr d16, [x15, #0xb0]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr d17, [x15, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
       "ldr x20, [x15, #0xc8]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr d7, [x15, #0xd0]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr x20, [x15, #0xe8]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr x11, [x15, #0xf8]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v6.d[1], x20\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr d16, [x15, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x15, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr x21, [x15, #0xe8]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr d17, [x15, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr x20, [x15, #0xf8]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr d16, [x15, #0xf0]\n"
+      "mov v17.d[1], x21\n"
       "add x12, x12, #0x10\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
+      "mov v16.d[1], x20\n"
+      "add x11, x11, #0x10\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
       "ldr d6, [x15, #0x0]\n"
-      "ldr x20, [x15, #0x8]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "ldr x21, [x15, #0x8]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr d0, [x12, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
+      "ldr d1, [x11, #0x0]\n"
       "sub x13, x13, #0x10\n"
       "ldr d7, [x15, #0x10]\n"
       "cmp x13, #0x20\n"
-      "ldr x10, [x12, #0x8]\n"
-      "mov v6.d[1], x20\n"
-      "ldr x28, [x9, #0x8]\n"
-      "mov v0.d[1], x10\n"
-      "ldr x11, [x15, #0x18]\n"
-      "mov v1.d[1], x28\n"
+      "ldr x20, [x12, #0x8]\n"
+      "mov v6.d[1], x21\n"
+      "ldr x21, [x11, #0x8]\n"
+      "mov v0.d[1], x20\n"
+      "ldr x20, [x15, #0x18]\n"
+      "mov v1.d[1], x21\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      "mov v7.d[1], x11\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "mov v7.d[1], x20\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       "bge 34b\n"
       "35:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
+      "ldr q17, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
       "sub x13, x13, #0x10\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x15, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x15, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x15, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x15, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x15, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x15, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x15, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x15, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x15, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x15, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x15, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x15, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x15, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x15, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x15, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x15, #0xf0]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
       "36:"  // Height 2: Multiply loop: Main loop skip
       "cbz x13, 41f\n"
       "cmp x13, #0x4\n"
       "blt 38f\n"
       "37:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x12], #0x4\n"
+      "ldr s19, [x12], #0x4\n"
       "sub x13, x13, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s18, [x11], #0x4\n"
       "cmp x13, #0x4\n"
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q17, [x15, #0x0]\n"
+      ".inst 0x4f93e228  // sdot v8.4s, v17.16b, v19.4b[0]\n"
+      "ldr q16, [x15, #0x10]\n"
+      ".inst 0x4f92e22c  // sdot v12.4s, v17.16b, v18.4b[0]\n"
+      "ldr q17, [x15, #0x20]\n"
+      ".inst 0x4f93e209  // sdot v9.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20d  // sdot v13.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      ".inst 0x4f93e22a  // sdot v10.4s, v17.16b, v19.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f92e22e  // sdot v14.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x4f93e20b  // sdot v11.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20f  // sdot v15.4s, v16.16b, v18.4b[0]\n"
       "bge 37b\n"
       "38:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x13, 41f\n"
       "tbz x13, #1, 39f\n"
       "ldr h0, [x12], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
+      "ldr h1, [x11], #0x2\n"
       "tbz x13, #0, 40f\n"
       "ld1 { v0.b }[2], [x12]\n"
-      "ld1 { v1.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x11]\n"
       "b 40f\n"
       "39:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x12, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
+      "ldr b1, [x11, #0x0]\n"
       "40:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q17, [x15, #0x0]\n"
+      ".inst 0x4f80e228  // sdot v8.4s, v17.16b, v0.4b[0]\n"
+      "ldr q16, [x15, #0x10]\n"
+      ".inst 0x4f81e22c  // sdot v12.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x15, #0x20]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20d  // sdot v13.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x15, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
       "41:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x14, x14, #0x1\n"
       "cmp x14, x20\n"
       "bne 31b\n"
-      "ldr q0, [x6, #0x0]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "ldr q1, [x6, #0x10]\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x6, #0x20]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "ldr q3, [x6, #0x30]\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q19, [x6, #0x0]\n"
+      "add v8.4s, v8.4s, v19.4s\n"
+      "ldr q18, [x6, #0x10]\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "ldr q17, [x6, #0x20]\n"
+      "add v10.4s, v10.4s, v17.4s\n"
+      "ldr q16, [x6, #0x30]\n"
+      "add v11.4s, v11.4s, v16.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x17, x20\n"
+      "add x25, x17, x20\n"
       "prfm pstl1keep, [x17, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
-      "add v15.4s, v15.4s, v3.4s\n"
+      "add v12.4s, v12.4s, v19.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "add v13.4s, v13.4s, v18.4s\n"
+      "add v14.4s, v14.4s, v17.4s\n"
+      "add v15.4s, v15.4s, v16.4s\n"
       "add x6, x6, #0x40\n"
       "tbz %x[flags], #4, 42f\n"
       "ldr q0, [x8, #0x0]\n"
@@ -688,10 +687,10 @@
       "add x7, x7, #0x40\n"
       "b 43f\n"
       "42:"  // Height 2: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -708,30 +707,30 @@
       "sqrdmulh v14.4s, v14.4s, v6.4s\n"
       "sqrdmulh v15.4s, v15.4s, v7.4s\n"
       "tbz %x[flags], #5, 44f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "and v19.16b, v8.16b, v0.16b\n"
+      "and v18.16b, v9.16b, v1.16b\n"
+      "and v17.16b, v10.16b, v2.16b\n"
+      "and v16.16b, v11.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v19.4s\n"
+      "sqadd v9.4s, v9.4s, v18.4s\n"
+      "sqadd v10.4s, v10.4s, v17.4s\n"
+      "sqadd v11.4s, v11.4s, v16.4s\n"
+      "and v19.16b, v12.16b, v0.16b\n"
+      "and v18.16b, v13.16b, v1.16b\n"
+      "and v17.16b, v14.16b, v2.16b\n"
+      "and v16.16b, v15.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v19.4s\n"
+      "sqadd v13.4s, v13.4s, v18.4s\n"
+      "sqadd v14.4s, v14.4s, v17.4s\n"
+      "sqadd v15.4s, v15.4s, v16.4s\n"
       "44:"  // Height 2: no shift correction
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
@@ -741,108 +740,108 @@
       "srshl v13.4s, v13.4s, v1.4s\n"
       "srshl v14.4s, v14.4s, v2.4s\n"
       "srshl v15.4s, v15.4s, v3.4s\n"
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v16.4s\n"
+      "add v9.4s, v9.4s, v16.4s\n"
+      "add v10.4s, v10.4s, v16.4s\n"
+      "add v11.4s, v11.4s, v16.4s\n"
+      "add v12.4s, v12.4s, v16.4s\n"
+      "add v13.4s, v13.4s, v16.4s\n"
+      "add v14.4s, v14.4s, v16.4s\n"
+      "add v15.4s, v15.4s, v16.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "smin v8.4s, v8.4s, v16.4s\n"
+      "smin v9.4s, v9.4s, v16.4s\n"
+      "smin v10.4s, v10.4s, v16.4s\n"
+      "smin v11.4s, v11.4s, v16.4s\n"
+      "smin v12.4s, v12.4s, v16.4s\n"
+      "smin v13.4s, v13.4s, v16.4s\n"
+      "smin v14.4s, v14.4s, v16.4s\n"
+      "smin v15.4s, v15.4s, v16.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "smax v8.4s, v8.4s, v16.4s\n"
+      "smax v9.4s, v9.4s, v16.4s\n"
+      "smax v10.4s, v10.4s, v16.4s\n"
+      "smax v11.4s, v11.4s, v16.4s\n"
+      "smax v12.4s, v12.4s, v16.4s\n"
+      "smax v13.4s, v13.4s, v16.4s\n"
+      "smax v14.4s, v14.4s, v16.4s\n"
+      "smax v15.4s, v15.4s, v16.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v17.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v16.8h, v14.8h, v15.8h\n"
       "cmp x16, #0x10\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v8.16b, v8.16b, v17.16b\n"
+      "uzp1 v12.16b, v12.16b, v16.16b\n"
       "bge 53f\n"
       "tbz x16, #3, 48f\n"
       "str d8, [x17], #0x8\n"
-      "str d12, [x24], #0x8\n"
+      "str d12, [x25], #0x8\n"
       "tbz x16, #2, 46f\n"
       "st1 { v8.s }[2], [x17], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
       "tbz x16, #1, 45f\n"
       "st1 { v8.h }[6], [x17], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[14], [x17]\n"
-      "st1 { v12.b }[14], [x24]\n"
+      "st1 { v12.b }[14], [x25]\n"
       "b 52f\n"
       "45:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[12], [x17]\n"
-      "st1 { v12.b }[12], [x24]\n"
+      "st1 { v12.b }[12], [x25]\n"
       "b 52f\n"
       "46:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x16, #1, 47f\n"
       "st1 { v8.h }[4], [x17], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[10], [x17]\n"
-      "st1 { v12.b }[10], [x24]\n"
+      "st1 { v12.b }[10], [x25]\n"
       "b 52f\n"
       "47:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[8], [x17]\n"
-      "st1 { v12.b }[8], [x24]\n"
+      "st1 { v12.b }[8], [x25]\n"
       "b 52f\n"
       "48:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x16, #2, 50f\n"
       "str s8, [x17], #0x4\n"
-      "str s12, [x24], #0x4\n"
+      "str s12, [x25], #0x4\n"
       "tbz x16, #1, 49f\n"
       "st1 { v8.h }[2], [x17], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[6], [x17]\n"
-      "st1 { v12.b }[6], [x24]\n"
+      "st1 { v12.b }[6], [x25]\n"
       "b 52f\n"
       "49:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[4], [x17]\n"
-      "st1 { v12.b }[4], [x24]\n"
+      "st1 { v12.b }[4], [x25]\n"
       "b 52f\n"
       "50:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x16, #1, 51f\n"
       "str h8, [x17], #0x2\n"
-      "str h12, [x24], #0x2\n"
+      "str h12, [x25], #0x2\n"
       "tbz x16, #0, 52f\n"
       "st1 { v8.b }[2], [x17]\n"
-      "st1 { v12.b }[2], [x24]\n"
+      "st1 { v12.b }[2], [x25]\n"
       "b 52f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_0
       "str b8, [x17, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
       "52:"  // Height 2: Partial direct writeback: Done
       "b 54f\n"
       "53:"  // Height 2: Full writeback
       "str q8, [x17, #0x0]\n"
       "add x17, x17, #0x10\n"
-      "str q12, [x24, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
       "54:"  // Height 2: Writeback done
       "subs x16, x16, #0x10\n"
       "bgt 29b\n"
@@ -872,308 +871,308 @@
       "58:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w13, [x20, x14, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x12, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x12, [x20, #0x0]\n"
+      "ldr x11, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
       "cbnz x14, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x12, x12, x20\n"
-      "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "b 60f\n"
       "59:"  // Height 3: setup direct input
       "mov x12, %x[input_ptr]\n"
-      "add x9, x12, x20\n"
-      "add x27, x9, x20\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
       "60:"  // Height 3: input setup done
       "cmp x13, #0x10\n"
       "blt 63f\n"
       "ldr q0, [x12, #0x0]\n"
       "cmp x13, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
+      "ldr q1, [x11, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
       "ldr q6, [x15, #0x0]\n"
       "ldr q7, [x15, #0x10]\n"
       "blt 62f\n"
       "61:"  // Height 3: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x20, [x15, #0x28]\n"
+      "ldr x21, [x15, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x15, #0x20]\n"
+      "ldr d21, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
+      "mov v21.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x48]\n"
+      "ldr x21, [x15, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x15, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x58]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x68]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x15, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x78]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr d6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0x88]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x98]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr d6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0xa8]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr d7, [x15, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xb8]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xc8]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xd8]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr d6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xe8]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr d7, [x15, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x15, #0xf8]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr d20, [x15, #0x30]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      "ldr x20, [x15, #0x58]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr d21, [x15, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      "ldr x21, [x15, #0x68]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr d20, [x15, #0x50]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      "ldr x20, [x15, #0x78]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr d21, [x15, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      "ldr x21, [x15, #0x88]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr d20, [x15, #0x70]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      "ldr x20, [x15, #0x98]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr d21, [x15, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      "ldr x21, [x15, #0xa8]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr d20, [x15, #0x90]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      "ldr x20, [x15, #0xb8]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr d21, [x15, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      "ldr x21, [x15, #0xc8]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr d20, [x15, #0xb0]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      "ldr x20, [x15, #0xd8]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr d21, [x15, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      "ldr x21, [x15, #0xe8]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr d20, [x15, #0xd0]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      "ldr x20, [x15, #0xf8]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr d21, [x15, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
       "add x12, x12, #0x10\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
-      "add x27, x27, #0x10\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr d20, [x15, #0xf0]\n"
+      "mov v20.d[1], x20\n"
+      "add x11, x11, #0x10\n"
+      "add x10, x10, #0x10\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
       "ldr x20, [x15, #0x8]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x10, [x12, #0x8]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      "ldr x23, [x12, #0x8]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
       "ldr d6, [x15, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
       "ldr d0, [x12, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
+      "ldr d1, [x11, #0x0]\n"
+      "ldr x22, [x11, #0x8]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
+      "ldr d2, [x10, #0x0]\n"
       "sub x13, x13, #0x10\n"
       "ldr d7, [x15, #0x10]\n"
       "cmp x13, #0x20\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr x21, [x10, #0x8]\n"
       "mov v6.d[1], x20\n"
-      "ldr x11, [x15, #0x18]\n"
-      "mov v0.d[1], x10\n"
+      "ldr x20, [x15, #0x18]\n"
+      "mov v0.d[1], x23\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      "mov v1.d[1], x28\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "mov v2.d[1], x26\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "mov v7.d[1], x11\n"
+      "mov v1.d[1], x22\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "mov v2.d[1], x21\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "mov v7.d[1], x20\n"
       "bge 61b\n"
       "62:"  // Height 3: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
+      "ldr q21, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x13, x13, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q20, [x15, #0x30]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x15, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x15, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x15, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x15, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x15, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x15, #0x50]\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x15, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x15, #0x70]\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x15, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x15, #0x90]\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x15, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x15, #0xb0]\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x15, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x15, #0xd0]\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x15, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x15, #0xf0]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
       "63:"  // Height 3: Multiply loop: Main loop skip
       "cbz x13, 68f\n"
       "cmp x13, #0x4\n"
       "blt 65f\n"
       "64:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x12], #0x4\n"
+      "ldr s24, [x12], #0x4\n"
       "sub x13, x13, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s23, [x11], #0x4\n"
       "cmp x13, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s22, [x10], #0x4\n"
+      "ldr q21, [x15, #0x0]\n"
+      ".inst 0x4f98e2a8  // sdot v8.4s, v21.16b, v24.4b[0]\n"
+      "ldr q20, [x15, #0x10]\n"
+      ".inst 0x4f97e2ac  // sdot v12.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x4f96e2b0  // sdot v16.4s, v21.16b, v22.4b[0]\n"
+      "ldr q21, [x15, #0x20]\n"
+      ".inst 0x4f98e289  // sdot v9.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28d  // sdot v13.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e291  // sdot v17.4s, v20.16b, v22.4b[0]\n"
+      "ldr q20, [x15, #0x30]\n"
+      ".inst 0x4f98e2aa  // sdot v10.4s, v21.16b, v24.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f97e2ae  // sdot v14.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x4f96e2b2  // sdot v18.4s, v21.16b, v22.4b[0]\n"
+      ".inst 0x4f98e28b  // sdot v11.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28f  // sdot v15.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e293  // sdot v19.4s, v20.16b, v22.4b[0]\n"
       "bge 64b\n"
       "65:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x13, 68f\n"
       "tbz x13, #1, 66f\n"
       "ldr h0, [x12], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
+      "ldr h1, [x11], #0x2\n"
+      "ldr h2, [x10], #0x2\n"
       "tbz x13, #0, 67f\n"
       "ld1 { v0.b }[2], [x12]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
+      "ld1 { v1.b }[2], [x11]\n"
+      "ld1 { v2.b }[2], [x10]\n"
       "b 67f\n"
       "66:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x12, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
+      "ldr b1, [x11, #0x0]\n"
+      "ldr b2, [x10, #0x0]\n"
       "67:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q21, [x15, #0x0]\n"
+      ".inst 0x4f80e2a8  // sdot v8.4s, v21.16b, v0.4b[0]\n"
+      "ldr q20, [x15, #0x10]\n"
+      ".inst 0x4f81e2ac  // sdot v12.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b0  // sdot v16.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x15, #0x20]\n"
+      ".inst 0x4f80e289  // sdot v9.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28d  // sdot v13.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e291  // sdot v17.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x15, #0x30]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
       "68:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x14, x14, #0x1\n"
       "cmp x14, x20\n"
       "bne 58b\n"
-      "ldr q0, [x6, #0x0]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "ldr q1, [x6, #0x10]\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x6, #0x20]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "ldr q3, [x6, #0x30]\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q23, [x6, #0x0]\n"
+      "add v8.4s, v8.4s, v23.4s\n"
+      "ldr q22, [x6, #0x10]\n"
+      "add v9.4s, v9.4s, v22.4s\n"
+      "ldr q21, [x6, #0x20]\n"
+      "add v10.4s, v10.4s, v21.4s\n"
+      "ldr q20, [x6, #0x30]\n"
+      "add v11.4s, v11.4s, v20.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x17, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x17, x20\n"
+      "add x24, x25, x20\n"
       "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "add v12.4s, v12.4s, v23.4s\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
-      "add v15.4s, v15.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v13.4s, v13.4s, v22.4s\n"
+      "add v14.4s, v14.4s, v21.4s\n"
+      "add v15.4s, v15.4s, v20.4s\n"
+      "add v16.4s, v16.4s, v23.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
       "add x6, x6, #0x40\n"
       "tbz %x[flags], #4, 69f\n"
       "ldr q0, [x8, #0x0]\n"
@@ -1188,10 +1187,10 @@
       "add x7, x7, #0x40\n"
       "b 70f\n"
       "69:"  // Height 3: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -1212,42 +1211,42 @@
       "sqrdmulh v18.4s, v18.4s, v6.4s\n"
       "sqrdmulh v19.4s, v19.4s, v7.4s\n"
       "tbz %x[flags], #5, 71f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v23.16b, v8.16b, v0.16b\n"
+      "and v22.16b, v9.16b, v1.16b\n"
+      "and v21.16b, v10.16b, v2.16b\n"
+      "and v20.16b, v11.16b, v3.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v23.4s\n"
+      "sqadd v9.4s, v9.4s, v22.4s\n"
+      "sqadd v10.4s, v10.4s, v21.4s\n"
+      "sqadd v11.4s, v11.4s, v20.4s\n"
+      "and v23.16b, v12.16b, v0.16b\n"
+      "and v22.16b, v13.16b, v1.16b\n"
+      "and v21.16b, v14.16b, v2.16b\n"
+      "and v20.16b, v15.16b, v3.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v23.4s\n"
+      "sqadd v13.4s, v13.4s, v22.4s\n"
+      "sqadd v14.4s, v14.4s, v21.4s\n"
+      "sqadd v15.4s, v15.4s, v20.4s\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v1.16b\n"
+      "and v21.16b, v18.16b, v2.16b\n"
+      "and v20.16b, v19.16b, v3.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "71:"  // Height 3: no shift correction
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
@@ -1261,139 +1260,139 @@
       "srshl v17.4s, v17.4s, v1.4s\n"
       "srshl v18.4s, v18.4s, v2.4s\n"
       "srshl v19.4s, v19.4s, v3.4s\n"
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v20.4s\n"
+      "add v9.4s, v9.4s, v20.4s\n"
+      "add v10.4s, v10.4s, v20.4s\n"
+      "add v11.4s, v11.4s, v20.4s\n"
+      "add v12.4s, v12.4s, v20.4s\n"
+      "add v13.4s, v13.4s, v20.4s\n"
+      "add v14.4s, v14.4s, v20.4s\n"
+      "add v15.4s, v15.4s, v20.4s\n"
+      "add v16.4s, v16.4s, v20.4s\n"
+      "add v17.4s, v17.4s, v20.4s\n"
+      "add v18.4s, v18.4s, v20.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "smin v8.4s, v8.4s, v20.4s\n"
+      "smin v9.4s, v9.4s, v20.4s\n"
+      "smin v10.4s, v10.4s, v20.4s\n"
+      "smin v11.4s, v11.4s, v20.4s\n"
+      "smin v12.4s, v12.4s, v20.4s\n"
+      "smin v13.4s, v13.4s, v20.4s\n"
+      "smin v14.4s, v14.4s, v20.4s\n"
+      "smin v15.4s, v15.4s, v20.4s\n"
+      "smin v16.4s, v16.4s, v20.4s\n"
+      "smin v17.4s, v17.4s, v20.4s\n"
+      "smin v18.4s, v18.4s, v20.4s\n"
+      "smin v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "smax v8.4s, v8.4s, v20.4s\n"
+      "smax v9.4s, v9.4s, v20.4s\n"
+      "smax v10.4s, v10.4s, v20.4s\n"
+      "smax v11.4s, v11.4s, v20.4s\n"
+      "smax v12.4s, v12.4s, v20.4s\n"
+      "smax v13.4s, v13.4s, v20.4s\n"
+      "smax v14.4s, v14.4s, v20.4s\n"
+      "smax v15.4s, v15.4s, v20.4s\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v21.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v20.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
       "cmp x16, #0x10\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v8.16b, v8.16b, v21.16b\n"
+      "uzp1 v12.16b, v12.16b, v20.16b\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
       "bge 80f\n"
       "tbz x16, #3, 75f\n"
       "str d8, [x17], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
       "tbz x16, #2, 73f\n"
       "st1 { v8.s }[2], [x17], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
       "tbz x16, #1, 72f\n"
       "st1 { v8.h }[6], [x17], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[14], [x17]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
       "b 79f\n"
       "72:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[12], [x17]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
       "b 79f\n"
       "73:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x16, #1, 74f\n"
       "st1 { v8.h }[4], [x17], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[10], [x17]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
       "b 79f\n"
       "74:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[8], [x17]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
       "b 79f\n"
       "75:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x16, #2, 77f\n"
       "str s8, [x17], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
       "tbz x16, #1, 76f\n"
       "st1 { v8.h }[2], [x17], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[6], [x17]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
       "b 79f\n"
       "76:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[4], [x17]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
       "b 79f\n"
       "77:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x16, #1, 78f\n"
       "str h8, [x17], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
       "tbz x16, #0, 79f\n"
       "st1 { v8.b }[2], [x17]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
       "b 79f\n"
       "78:"  // Height 3: Partial direct writeback: partial_1_0
       "str b8, [x17, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
       "79:"  // Height 3: Partial direct writeback: Done
       "b 81f\n"
       "80:"  // Height 3: Full writeback
       "str q8, [x17, #0x0]\n"
       "add x17, x17, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
       "81:"  // Height 3: Writeback done
       "subs x16, x16, #0x10\n"
       "bgt 56b\n"
@@ -1427,369 +1426,369 @@
       "85:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w13, [x20, x14, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 86f\n"
-      "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x12, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x12, [x20, #0x0]\n"
+      "ldr x11, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x9, [x20, #0x18]\n"
       "cbnz x14, 87f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
       "b 87f\n"
       "86:"  // Height 4: setup direct input
       "mov x12, %x[input_ptr]\n"
-      "add x9, x12, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
       "87:"  // Height 4: input setup done
       "cmp x13, #0x10\n"
       "blt 90f\n"
       "ldr q0, [x12, #0x0]\n"
       "cmp x13, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
+      "ldr q1, [x11, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x9, #0x0]\n"
       "ldr q6, [x15, #0x0]\n"
       "ldr q7, [x15, #0x10]\n"
       "blt 89f\n"
       "88:"  // Height 4: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x20, [x15, #0x28]\n"
+      "ldr x21, [x15, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x15, #0x20]\n"
+      "ldr d25, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
+      "mov v25.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x48]\n"
+      "ldr x21, [x15, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x15, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x58]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x68]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "add x25, x25, #0x10\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x15, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x78]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr x10, [x12, #0x8]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr d6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0x88]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x98]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr x26, [x27, #0x8]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr d6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0xa8]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr d7, [x15, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xb8]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr d24, [x15, #0x30]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      "ldr x20, [x15, #0x58]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr d25, [x15, #0x40]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      "ldr x21, [x15, #0x68]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr d24, [x15, #0x50]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      "ldr x20, [x15, #0x78]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      "ldr x25, [x12, #0x8]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr d25, [x15, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      "ldr x21, [x15, #0x88]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      "ldr x24, [x11, #0x8]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr d24, [x15, #0x70]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      "ldr x20, [x15, #0x98]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      "ldr x23, [x10, #0x8]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr d25, [x15, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      "ldr x21, [x15, #0xa8]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      "ldr x22, [x9, #0x8]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr d24, [x15, #0x90]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      "ldr x20, [x15, #0xb8]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
       "sub x13, x13, #0x10\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xc8]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr d25, [x15, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      "ldr x21, [x15, #0xc8]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
       "cmp x13, #0x20\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xd8]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr d24, [x15, #0xb0]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      "ldr x20, [x15, #0xd8]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr d6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xe8]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr d25, [x15, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      "ldr x21, [x15, #0xe8]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr d24, [x15, #0xd0]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      "ldr x20, [x15, #0xf8]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr d25, [x15, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr d7, [x15, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x15, #0xf8]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr d24, [x15, #0xf0]\n"
+      "mov v24.d[1], x20\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x20, [x15, #0x8]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x15, #0x18]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
+      "ldr x21, [x15, #0x8]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      "ldr x20, [x15, #0x18]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
       "ldr d6, [x15, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
       "ldr d0, [x12, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
+      "ldr d1, [x11, #0x0]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
+      "ldr d2, [x10, #0x0]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
+      "ldr d3, [x9, #0x0]\n"
       "ldr d7, [x15, #0x10]\n"
-      "mov v6.d[1], x20\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
-      "mov v7.d[1], x11\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x25\n"
+      "mov v1.d[1], x24\n"
+      "mov v2.d[1], x23\n"
+      "mov v3.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 88b\n"
       "89:"  // Height 4: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
+      "ldr q25, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x13, x13, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q24, [x15, #0x30]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x15, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x15, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x15, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x15, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x15, #0x40]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x15, #0x50]\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x15, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x15, #0x70]\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x15, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x15, #0x90]\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x15, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x15, #0xb0]\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x15, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x15, #0xd0]\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x15, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x15, #0xf0]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
       "90:"  // Height 4: Multiply loop: Main loop skip
       "cbz x13, 95f\n"
       "cmp x13, #0x4\n"
       "blt 92f\n"
       "91:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x12], #0x4\n"
+      "ldr s29, [x12], #0x4\n"
       "sub x13, x13, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s28, [x11], #0x4\n"
       "cmp x13, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s27, [x10], #0x4\n"
+      "ldr s26, [x9], #0x4\n"
+      "ldr q25, [x15, #0x0]\n"
+      ".inst 0x4f9de328  // sdot v8.4s, v25.16b, v29.4b[0]\n"
+      "ldr q24, [x15, #0x10]\n"
+      ".inst 0x4f9ce32c  // sdot v12.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be330  // sdot v16.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae334  // sdot v20.4s, v25.16b, v26.4b[0]\n"
+      "ldr q25, [x15, #0x20]\n"
+      ".inst 0x4f9de309  // sdot v9.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30d  // sdot v13.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be311  // sdot v17.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae315  // sdot v21.4s, v24.16b, v26.4b[0]\n"
+      "ldr q24, [x15, #0x30]\n"
+      ".inst 0x4f9de32a  // sdot v10.4s, v25.16b, v29.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f9ce32e  // sdot v14.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be332  // sdot v18.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae336  // sdot v22.4s, v25.16b, v26.4b[0]\n"
+      ".inst 0x4f9de30b  // sdot v11.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30f  // sdot v15.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be313  // sdot v19.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae317  // sdot v23.4s, v24.16b, v26.4b[0]\n"
       "bge 91b\n"
       "92:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x13, 95f\n"
       "tbz x13, #1, 93f\n"
       "ldr h0, [x12], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
+      "ldr h1, [x11], #0x2\n"
+      "ldr h2, [x10], #0x2\n"
+      "ldr h3, [x9], #0x2\n"
       "tbz x13, #0, 94f\n"
       "ld1 { v0.b }[2], [x12]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
+      "ld1 { v1.b }[2], [x11]\n"
+      "ld1 { v2.b }[2], [x10]\n"
+      "ld1 { v3.b }[2], [x9]\n"
       "b 94f\n"
       "93:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x12, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
+      "ldr b1, [x11, #0x0]\n"
+      "ldr b2, [x10, #0x0]\n"
+      "ldr b3, [x9, #0x0]\n"
       "94:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q25, [x15, #0x0]\n"
+      ".inst 0x4f80e328  // sdot v8.4s, v25.16b, v0.4b[0]\n"
+      "ldr q24, [x15, #0x10]\n"
+      ".inst 0x4f81e32c  // sdot v12.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e330  // sdot v16.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e334  // sdot v20.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x15, #0x20]\n"
+      ".inst 0x4f80e309  // sdot v9.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30d  // sdot v13.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e311  // sdot v17.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e315  // sdot v21.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x15, #0x30]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
       "95:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x14, x14, #0x1\n"
       "cmp x14, x20\n"
       "bne 85b\n"
-      "ldr q0, [x6, #0x0]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "ldr q1, [x6, #0x10]\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x6, #0x20]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "ldr q3, [x6, #0x30]\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q27, [x6, #0x0]\n"
+      "add v8.4s, v8.4s, v27.4s\n"
+      "ldr q26, [x6, #0x10]\n"
+      "add v9.4s, v9.4s, v26.4s\n"
+      "ldr q25, [x6, #0x20]\n"
+      "add v10.4s, v10.4s, v25.4s\n"
+      "ldr q24, [x6, #0x30]\n"
+      "add v11.4s, v11.4s, v24.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x17, x20\n"
+      "add x25, x17, x20\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
       "prfm pstl1keep, [x17, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
+      "add v12.4s, v12.4s, v27.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "add v13.4s, v13.4s, v26.4s\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
+      "add v14.4s, v14.4s, v25.4s\n"
       "prfm pstl1keep, [x23, #0x0]\n"
-      "add v14.4s, v14.4s, v2.4s\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
-      "add v15.4s, v15.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
+      "add v15.4s, v15.4s, v24.4s\n"
+      "add v16.4s, v16.4s, v27.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v25.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v25.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
       "add x6, x6, #0x40\n"
       "tbz %x[flags], #4, 96f\n"
       "ldr q0, [x8, #0x0]\n"
@@ -1804,10 +1803,10 @@
       "add x7, x7, #0x40\n"
       "b 97f\n"
       "96:"  // Height 4: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -1832,54 +1831,54 @@
       "sqrdmulh v22.4s, v22.4s, v6.4s\n"
       "sqrdmulh v23.4s, v23.4s, v7.4s\n"
       "tbz %x[flags], #5, 98f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v20.16b, v0.16b\n"
-      "and v5.16b, v21.16b, v1.16b\n"
-      "and v6.16b, v22.16b, v2.16b\n"
-      "and v7.16b, v23.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v4.4s\n"
-      "sqadd v21.4s, v21.4s, v5.4s\n"
-      "sqadd v22.4s, v22.4s, v6.4s\n"
-      "sqadd v23.4s, v23.4s, v7.4s\n"
+      "and v27.16b, v8.16b, v0.16b\n"
+      "and v26.16b, v9.16b, v1.16b\n"
+      "and v25.16b, v10.16b, v2.16b\n"
+      "and v24.16b, v11.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v27.4s\n"
+      "sqadd v9.4s, v9.4s, v26.4s\n"
+      "sqadd v10.4s, v10.4s, v25.4s\n"
+      "sqadd v11.4s, v11.4s, v24.4s\n"
+      "and v27.16b, v12.16b, v0.16b\n"
+      "and v26.16b, v13.16b, v1.16b\n"
+      "and v25.16b, v14.16b, v2.16b\n"
+      "and v24.16b, v15.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v27.4s\n"
+      "sqadd v13.4s, v13.4s, v26.4s\n"
+      "sqadd v14.4s, v14.4s, v25.4s\n"
+      "sqadd v15.4s, v15.4s, v24.4s\n"
+      "and v27.16b, v16.16b, v0.16b\n"
+      "and v26.16b, v17.16b, v1.16b\n"
+      "and v25.16b, v18.16b, v2.16b\n"
+      "and v24.16b, v19.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v27.4s\n"
+      "sqadd v17.4s, v17.4s, v26.4s\n"
+      "sqadd v18.4s, v18.4s, v25.4s\n"
+      "sqadd v19.4s, v19.4s, v24.4s\n"
+      "and v27.16b, v20.16b, v0.16b\n"
+      "and v26.16b, v21.16b, v1.16b\n"
+      "and v25.16b, v22.16b, v2.16b\n"
+      "and v24.16b, v23.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v27.4s\n"
+      "sqadd v21.4s, v21.4s, v26.4s\n"
+      "sqadd v22.4s, v22.4s, v25.4s\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
       "98:"  // Height 4: no shift correction
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
@@ -1897,170 +1896,170 @@
       "srshl v21.4s, v21.4s, v1.4s\n"
       "srshl v22.4s, v22.4s, v2.4s\n"
       "srshl v23.4s, v23.4s, v3.4s\n"
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v24.4s\n"
+      "add v9.4s, v9.4s, v24.4s\n"
+      "add v10.4s, v10.4s, v24.4s\n"
+      "add v11.4s, v11.4s, v24.4s\n"
+      "add v12.4s, v12.4s, v24.4s\n"
+      "add v13.4s, v13.4s, v24.4s\n"
+      "add v14.4s, v14.4s, v24.4s\n"
+      "add v15.4s, v15.4s, v24.4s\n"
+      "add v16.4s, v16.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v24.4s\n"
+      "add v18.4s, v18.4s, v24.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v24.4s\n"
+      "add v21.4s, v21.4s, v24.4s\n"
+      "add v22.4s, v22.4s, v24.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "smin v8.4s, v8.4s, v24.4s\n"
+      "smin v9.4s, v9.4s, v24.4s\n"
+      "smin v10.4s, v10.4s, v24.4s\n"
+      "smin v11.4s, v11.4s, v24.4s\n"
+      "smin v12.4s, v12.4s, v24.4s\n"
+      "smin v13.4s, v13.4s, v24.4s\n"
+      "smin v14.4s, v14.4s, v24.4s\n"
+      "smin v15.4s, v15.4s, v24.4s\n"
+      "smin v16.4s, v16.4s, v24.4s\n"
+      "smin v17.4s, v17.4s, v24.4s\n"
+      "smin v18.4s, v18.4s, v24.4s\n"
+      "smin v19.4s, v19.4s, v24.4s\n"
+      "smin v20.4s, v20.4s, v24.4s\n"
+      "smin v21.4s, v21.4s, v24.4s\n"
+      "smin v22.4s, v22.4s, v24.4s\n"
+      "smin v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "smax v8.4s, v8.4s, v24.4s\n"
+      "smax v9.4s, v9.4s, v24.4s\n"
+      "smax v10.4s, v10.4s, v24.4s\n"
+      "smax v11.4s, v11.4s, v24.4s\n"
+      "smax v12.4s, v12.4s, v24.4s\n"
+      "smax v13.4s, v13.4s, v24.4s\n"
+      "smax v14.4s, v14.4s, v24.4s\n"
+      "smax v15.4s, v15.4s, v24.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v25.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v24.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v17.8h, v22.8h, v23.8h\n"
       "cmp x16, #0x10\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v8.16b, v8.16b, v25.16b\n"
+      "uzp1 v12.16b, v12.16b, v24.16b\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v20.16b, v20.16b, v17.16b\n"
       "bge 107f\n"
       "tbz x16, #3, 102f\n"
       "str d8, [x17], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
-      "str d20, [x22], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
+      "str d20, [x23], #0x8\n"
       "tbz x16, #2, 100f\n"
       "st1 { v8.s }[2], [x17], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
       "tbz x16, #1, 99f\n"
       "st1 { v8.h }[6], [x17], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[14], [x17]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
-      "st1 { v20.b }[14], [x22]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
+      "st1 { v20.b }[14], [x23]\n"
       "b 106f\n"
       "99:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[12], [x17]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
-      "st1 { v20.b }[12], [x22]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
+      "st1 { v20.b }[12], [x23]\n"
       "b 106f\n"
       "100:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x16, #1, 101f\n"
       "st1 { v8.h }[4], [x17], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[10], [x17]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
-      "st1 { v20.b }[10], [x22]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
+      "st1 { v20.b }[10], [x23]\n"
       "b 106f\n"
       "101:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[8], [x17]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
-      "st1 { v20.b }[8], [x22]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
+      "st1 { v20.b }[8], [x23]\n"
       "b 106f\n"
       "102:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x16, #2, 104f\n"
       "str s8, [x17], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
-      "str s20, [x22], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
+      "str s20, [x23], #0x4\n"
       "tbz x16, #1, 103f\n"
       "st1 { v8.h }[2], [x17], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[6], [x17]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
-      "st1 { v20.b }[6], [x22]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
+      "st1 { v20.b }[6], [x23]\n"
       "b 106f\n"
       "103:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[4], [x17]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
-      "st1 { v20.b }[4], [x22]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
+      "st1 { v20.b }[4], [x23]\n"
       "b 106f\n"
       "104:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x16, #1, 105f\n"
       "str h8, [x17], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
-      "str h20, [x22], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
+      "str h20, [x23], #0x2\n"
       "tbz x16, #0, 106f\n"
       "st1 { v8.b }[2], [x17]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
-      "st1 { v20.b }[2], [x22]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
+      "st1 { v20.b }[2], [x23]\n"
       "b 106f\n"
       "105:"  // Height 4: Partial direct writeback: partial_1_0
       "str b8, [x17, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
       "106:"  // Height 4: Partial direct writeback: Done
       "b 108f\n"
       "107:"  // Height 4: Full writeback
       "str q8, [x17, #0x0]\n"
       "add x17, x17, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
-      "str q20, [x22, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
       "108:"  // Height 4: Writeback done
       "subs x16, x16, #0x10\n"
       "bgt 83b\n"
@@ -2098,430 +2097,430 @@
       "112:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w13, [x20, x14, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 113f\n"
-      "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x12, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x12, [x20, #0x0]\n"
+      "ldr x11, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x9, [x20, #0x18]\n"
+      "ldr x28, [x20, #0x20]\n"
       "cbnz x14, 114f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
-      "add x23, x23, x20\n"
+      "add x28, x28, x20\n"
       "b 114f\n"
       "113:"  // Height 5: setup direct input
       "mov x12, %x[input_ptr]\n"
-      "add x9, x12, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
-      "add x23, x25, x20\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
+      "add x28, x9, x21\n"
       "114:"  // Height 5: input setup done
       "cmp x13, #0x10\n"
       "blt 117f\n"
       "ldr q0, [x12, #0x0]\n"
       "cmp x13, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
+      "ldr q1, [x11, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x9, #0x0]\n"
+      "ldr q4, [x28, #0x0]\n"
       "ldr q6, [x15, #0x0]\n"
       "ldr q7, [x15, #0x10]\n"
       "blt 116f\n"
       "115:"  // Height 5: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x20, [x15, #0x28]\n"
+      "ldr x21, [x15, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr d6, [x15, #0x20]\n"
+      "ldr d29, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
+      "mov v29.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x48]\n"
+      "ldr x21, [x15, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr d7, [x15, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x58]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "add x23, x23, #0x10\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr x10, [x12, #0x8]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr d6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x68]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr x26, [x27, #0x8]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr d7, [x15, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x78]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr x22, [x23, #0x8]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr d6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0x88]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr d28, [x15, #0x30]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      "ldr x20, [x15, #0x58]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      "ldr x26, [x12, #0x8]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr d29, [x15, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      "ldr x21, [x15, #0x68]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      "ldr x25, [x11, #0x8]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      "ldr x24, [x10, #0x8]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr d28, [x15, #0x50]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      "ldr x20, [x15, #0x78]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      "ldr x23, [x9, #0x8]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      "ldr x22, [x28, #0x8]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr d29, [x15, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      "ldr x21, [x15, #0x88]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
       "sub x13, x13, #0x10\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
       "cmp x13, #0x20\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr d7, [x15, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x98]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr d28, [x15, #0x70]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      "ldr x20, [x15, #0x98]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr d29, [x15, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      "ldr x21, [x15, #0xa8]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr d6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0xa8]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr d7, [x15, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xb8]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr d6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xc8]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr d7, [x15, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xd8]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr d6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xe8]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr d7, [x15, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x15, #0xf8]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr d6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x20\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr d7, [x15, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr d28, [x15, #0x90]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      "ldr x20, [x15, #0xb8]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr d29, [x15, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      "ldr x21, [x15, #0xc8]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr d28, [x15, #0xb0]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      "ldr x20, [x15, #0xd8]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr d29, [x15, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      "ldr x21, [x15, #0xe8]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr d28, [x15, #0xd0]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      "ldr x20, [x15, #0xf8]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr d29, [x15, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr d28, [x15, #0xf0]\n"
+      "mov v28.d[1], x20\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x20, [x15, #0x8]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x15, #0x18]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
+      "ldr x21, [x15, #0x8]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      "ldr x20, [x15, #0x18]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
       "ldr d6, [x15, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
       "ldr d0, [x12, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
+      "ldr d1, [x11, #0x0]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
+      "ldr d2, [x10, #0x0]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
+      "ldr d3, [x9, #0x0]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
+      "ldr d4, [x28, #0x0]\n"
       "ldr d7, [x15, #0x10]\n"
-      "mov v6.d[1], x20\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x26\n"
+      "mov v1.d[1], x25\n"
+      "mov v2.d[1], x24\n"
+      "mov v3.d[1], x23\n"
       "mov v4.d[1], x22\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "bge 115b\n"
       "116:"  // Height 5: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
+      "ldr q29, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x13, x13, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x15, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x15, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x15, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x15, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x15, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x15, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x15, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x15, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x15, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x15, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x15, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x15, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "ldr q28, [x15, #0x30]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x15, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x15, #0x50]\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x15, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x15, #0x70]\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x15, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x15, #0x90]\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x15, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x15, #0xb0]\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x15, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x15, #0xd0]\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x15, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x15, #0xf0]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
       "add x15, x15, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
       "117:"  // Height 5: Multiply loop: Main loop skip
       "cbz x13, 122f\n"
       "cmp x13, #0x4\n"
       "blt 119f\n"
       "118:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x12], #0x4\n"
+      "ldr s2, [x12], #0x4\n"
       "sub x13, x13, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s1, [x11], #0x4\n"
       "cmp x13, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s0, [x10], #0x4\n"
+      "ldr s31, [x9], #0x4\n"
+      "ldr s30, [x28], #0x4\n"
+      "ldr q29, [x15, #0x0]\n"
+      ".inst 0x4f82e3a8  // sdot v8.4s, v29.16b, v2.4b[0]\n"
+      "ldr q28, [x15, #0x10]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3b0  // sdot v16.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b4  // sdot v20.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3b8  // sdot v24.4s, v29.16b, v30.4b[0]\n"
+      "ldr q29, [x15, #0x20]\n"
+      ".inst 0x4f82e389  // sdot v9.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e391  // sdot v17.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe395  // sdot v21.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee399  // sdot v25.4s, v28.16b, v30.4b[0]\n"
+      "ldr q28, [x15, #0x30]\n"
+      ".inst 0x4f82e3aa  // sdot v10.4s, v29.16b, v2.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b6  // sdot v22.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3ba  // sdot v26.4s, v29.16b, v30.4b[0]\n"
+      ".inst 0x4f82e38b  // sdot v11.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe397  // sdot v23.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee39b  // sdot v27.4s, v28.16b, v30.4b[0]\n"
       "bge 118b\n"
       "119:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x13, 122f\n"
       "tbz x13, #1, 120f\n"
       "ldr h0, [x12], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
+      "ldr h1, [x11], #0x2\n"
+      "ldr h2, [x10], #0x2\n"
+      "ldr h3, [x9], #0x2\n"
+      "ldr h4, [x28], #0x2\n"
       "tbz x13, #0, 121f\n"
       "ld1 { v0.b }[2], [x12]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
-      "ld1 { v4.b }[2], [x23]\n"
+      "ld1 { v1.b }[2], [x11]\n"
+      "ld1 { v2.b }[2], [x10]\n"
+      "ld1 { v3.b }[2], [x9]\n"
+      "ld1 { v4.b }[2], [x28]\n"
       "b 121f\n"
       "120:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x12, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
-      "ldr b4, [x23, #0x0]\n"
+      "ldr b1, [x11, #0x0]\n"
+      "ldr b2, [x10, #0x0]\n"
+      "ldr b3, [x9, #0x0]\n"
+      "ldr b4, [x28, #0x0]\n"
       "121:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q29, [x15, #0x0]\n"
+      ".inst 0x4f80e3a8  // sdot v8.4s, v29.16b, v0.4b[0]\n"
+      "ldr q28, [x15, #0x10]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b0  // sdot v16.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b4  // sdot v20.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3b8  // sdot v24.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x15, #0x20]\n"
+      ".inst 0x4f80e389  // sdot v9.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e391  // sdot v17.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e395  // sdot v21.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e399  // sdot v25.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x15, #0x30]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
       "122:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x14, x14, #0x1\n"
       "cmp x14, x20\n"
       "bne 112b\n"
-      "ldr q0, [x6, #0x0]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "ldr q1, [x6, #0x10]\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x6, #0x20]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "ldr q3, [x6, #0x30]\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q31, [x6, #0x0]\n"
+      "add v8.4s, v8.4s, v31.4s\n"
+      "ldr q30, [x6, #0x10]\n"
+      "add v9.4s, v9.4s, v30.4s\n"
+      "ldr q29, [x6, #0x20]\n"
+      "add v10.4s, v10.4s, v29.4s\n"
+      "ldr q28, [x6, #0x30]\n"
+      "add v11.4s, v11.4s, v28.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x17, x20\n"
+      "add x25, x17, x20\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
       "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
       "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "add v12.4s, v12.4s, v31.4s\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
+      "add v13.4s, v13.4s, v30.4s\n"
       "prfm pstl1keep, [x23, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
+      "add v14.4s, v14.4s, v29.4s\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "add v14.4s, v14.4s, v2.4s\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
-      "add v15.4s, v15.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
+      "add v15.4s, v15.4s, v28.4s\n"
+      "add v16.4s, v16.4s, v31.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v31.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v31.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
       "add x6, x6, #0x40\n"
       "tbz %x[flags], #4, 123f\n"
       "ldr q0, [x8, #0x0]\n"
@@ -2536,10 +2535,10 @@
       "add x7, x7, #0x40\n"
       "b 124f\n"
       "123:"  // Height 5: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -2568,66 +2567,66 @@
       "sqrdmulh v26.4s, v26.4s, v6.4s\n"
       "sqrdmulh v27.4s, v27.4s, v7.4s\n"
       "tbz %x[flags], #5, 125f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v20.16b, v0.16b\n"
-      "and v5.16b, v21.16b, v1.16b\n"
-      "and v6.16b, v22.16b, v2.16b\n"
-      "and v7.16b, v23.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v4.4s\n"
-      "sqadd v21.4s, v21.4s, v5.4s\n"
-      "sqadd v22.4s, v22.4s, v6.4s\n"
-      "sqadd v23.4s, v23.4s, v7.4s\n"
-      "and v4.16b, v24.16b, v0.16b\n"
-      "and v5.16b, v25.16b, v1.16b\n"
-      "and v6.16b, v26.16b, v2.16b\n"
-      "and v7.16b, v27.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v24.4s, v24.4s, v4.4s\n"
-      "sqadd v25.4s, v25.4s, v5.4s\n"
-      "sqadd v26.4s, v26.4s, v6.4s\n"
-      "sqadd v27.4s, v27.4s, v7.4s\n"
+      "and v31.16b, v8.16b, v0.16b\n"
+      "and v30.16b, v9.16b, v1.16b\n"
+      "and v29.16b, v10.16b, v2.16b\n"
+      "and v28.16b, v11.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v31.4s\n"
+      "sqadd v9.4s, v9.4s, v30.4s\n"
+      "sqadd v10.4s, v10.4s, v29.4s\n"
+      "sqadd v11.4s, v11.4s, v28.4s\n"
+      "and v31.16b, v12.16b, v0.16b\n"
+      "and v30.16b, v13.16b, v1.16b\n"
+      "and v29.16b, v14.16b, v2.16b\n"
+      "and v28.16b, v15.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v31.4s\n"
+      "sqadd v13.4s, v13.4s, v30.4s\n"
+      "sqadd v14.4s, v14.4s, v29.4s\n"
+      "sqadd v15.4s, v15.4s, v28.4s\n"
+      "and v31.16b, v16.16b, v0.16b\n"
+      "and v30.16b, v17.16b, v1.16b\n"
+      "and v29.16b, v18.16b, v2.16b\n"
+      "and v28.16b, v19.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v31.4s\n"
+      "sqadd v17.4s, v17.4s, v30.4s\n"
+      "sqadd v18.4s, v18.4s, v29.4s\n"
+      "sqadd v19.4s, v19.4s, v28.4s\n"
+      "and v31.16b, v20.16b, v0.16b\n"
+      "and v30.16b, v21.16b, v1.16b\n"
+      "and v29.16b, v22.16b, v2.16b\n"
+      "and v28.16b, v23.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v31.4s\n"
+      "sqadd v21.4s, v21.4s, v30.4s\n"
+      "sqadd v22.4s, v22.4s, v29.4s\n"
+      "sqadd v23.4s, v23.4s, v28.4s\n"
+      "and v31.16b, v24.16b, v0.16b\n"
+      "and v30.16b, v25.16b, v1.16b\n"
+      "and v29.16b, v26.16b, v2.16b\n"
+      "and v28.16b, v27.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v31.4s\n"
+      "sqadd v25.4s, v25.4s, v30.4s\n"
+      "sqadd v26.4s, v26.4s, v29.4s\n"
+      "sqadd v27.4s, v27.4s, v28.4s\n"
       "125:"  // Height 5: no shift correction
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
@@ -2649,201 +2648,201 @@
       "srshl v25.4s, v25.4s, v1.4s\n"
       "srshl v26.4s, v26.4s, v2.4s\n"
       "srshl v27.4s, v27.4s, v3.4s\n"
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v28.4s\n"
+      "add v9.4s, v9.4s, v28.4s\n"
+      "add v10.4s, v10.4s, v28.4s\n"
+      "add v11.4s, v11.4s, v28.4s\n"
+      "add v12.4s, v12.4s, v28.4s\n"
+      "add v13.4s, v13.4s, v28.4s\n"
+      "add v14.4s, v14.4s, v28.4s\n"
+      "add v15.4s, v15.4s, v28.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v28.4s\n"
+      "add v18.4s, v18.4s, v28.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v28.4s\n"
+      "add v21.4s, v21.4s, v28.4s\n"
+      "add v22.4s, v22.4s, v28.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v28.4s\n"
+      "add v25.4s, v25.4s, v28.4s\n"
+      "add v26.4s, v26.4s, v28.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "smin v8.4s, v8.4s, v28.4s\n"
+      "smin v9.4s, v9.4s, v28.4s\n"
+      "smin v10.4s, v10.4s, v28.4s\n"
+      "smin v11.4s, v11.4s, v28.4s\n"
+      "smin v12.4s, v12.4s, v28.4s\n"
+      "smin v13.4s, v13.4s, v28.4s\n"
+      "smin v14.4s, v14.4s, v28.4s\n"
+      "smin v15.4s, v15.4s, v28.4s\n"
+      "smin v16.4s, v16.4s, v28.4s\n"
+      "smin v17.4s, v17.4s, v28.4s\n"
+      "smin v18.4s, v18.4s, v28.4s\n"
+      "smin v19.4s, v19.4s, v28.4s\n"
+      "smin v20.4s, v20.4s, v28.4s\n"
+      "smin v21.4s, v21.4s, v28.4s\n"
+      "smin v22.4s, v22.4s, v28.4s\n"
+      "smin v23.4s, v23.4s, v28.4s\n"
+      "smin v24.4s, v24.4s, v28.4s\n"
+      "smin v25.4s, v25.4s, v28.4s\n"
+      "smin v26.4s, v26.4s, v28.4s\n"
+      "smin v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "smax v8.4s, v8.4s, v28.4s\n"
+      "smax v9.4s, v9.4s, v28.4s\n"
+      "smax v10.4s, v10.4s, v28.4s\n"
+      "smax v11.4s, v11.4s, v28.4s\n"
+      "smax v12.4s, v12.4s, v28.4s\n"
+      "smax v13.4s, v13.4s, v28.4s\n"
+      "smax v14.4s, v14.4s, v28.4s\n"
+      "smax v15.4s, v15.4s, v28.4s\n"
+      "smax v16.4s, v16.4s, v28.4s\n"
+      "smax v17.4s, v17.4s, v28.4s\n"
+      "smax v18.4s, v18.4s, v28.4s\n"
+      "smax v19.4s, v19.4s, v28.4s\n"
+      "smax v20.4s, v20.4s, v28.4s\n"
+      "smax v21.4s, v21.4s, v28.4s\n"
+      "smax v22.4s, v22.4s, v28.4s\n"
+      "smax v23.4s, v23.4s, v28.4s\n"
+      "smax v24.4s, v24.4s, v28.4s\n"
+      "smax v25.4s, v25.4s, v28.4s\n"
+      "smax v26.4s, v26.4s, v28.4s\n"
+      "smax v27.4s, v27.4s, v28.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v29.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v28.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v18.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
       "cmp x16, #0x10\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v8.16b, v8.16b, v29.16b\n"
+      "uzp1 v12.16b, v12.16b, v28.16b\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v20.16b, v20.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 134f\n"
       "tbz x16, #3, 129f\n"
       "str d8, [x17], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x16, #2, 127f\n"
       "st1 { v8.s }[2], [x17], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x16, #1, 126f\n"
       "st1 { v8.h }[6], [x17], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[14], [x17]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 133f\n"
       "126:"  // Height 5: Partial direct writeback: partial_1_12
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[12], [x17]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 133f\n"
       "127:"  // Height 5: Partial direct writeback: partial_2_8
       "tbz x16, #1, 128f\n"
       "st1 { v8.h }[4], [x17], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[10], [x17]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 133f\n"
       "128:"  // Height 5: Partial direct writeback: partial_1_8
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[8], [x17]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 133f\n"
       "129:"  // Height 5: Partial direct writeback: partial_4_0
       "tbz x16, #2, 131f\n"
       "str s8, [x17], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x16, #1, 130f\n"
       "st1 { v8.h }[2], [x17], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[6], [x17]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 133f\n"
       "130:"  // Height 5: Partial direct writeback: partial_1_4
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[4], [x17]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 133f\n"
       "131:"  // Height 5: Partial direct writeback: partial_2_0
       "tbz x16, #1, 132f\n"
       "str h8, [x17], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x16, #0, 133f\n"
       "st1 { v8.b }[2], [x17]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 133f\n"
       "132:"  // Height 5: Partial direct writeback: partial_1_0
       "str b8, [x17, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "133:"  // Height 5: Partial direct writeback: Done
       "b 135f\n"
       "134:"  // Height 5: Full writeback
       "str q8, [x17, #0x0]\n"
       "add x17, x17, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "135:"  // Height 5: Writeback done
       "subs x16, x16, #0x10\n"
       "bgt 110b\n"
@@ -2888,191 +2887,191 @@
       "139:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w13, [x20, x14, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 140f\n"
-      "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x12, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x12, [x20, #0x0]\n"
+      "ldr x11, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x9, [x20, #0x18]\n"
+      "ldr x28, [x20, #0x20]\n"
+      "ldr x27, [x20, #0x28]\n"
       "cbnz x14, 141f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
       "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
-      "add x23, x23, x20\n"
-      "add x21, x21, x20\n"
       "b 141f\n"
       "140:"  // Height 6: setup direct input
       "mov x12, %x[input_ptr]\n"
-      "add x9, x12, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
-      "add x23, x25, x20\n"
-      "add x21, x23, x20\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
+      "add x28, x9, x21\n"
+      "add x27, x28, x21\n"
       "141:"  // Height 6: input setup done
       "cmp x13, #0x10\n"
       "blt 144f\n"
       "ldr q0, [x12, #0x0]\n"
       "cmp x13, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
-      "ldr q5, [x21, #0x0]\n"
+      "ldr q1, [x11, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x9, #0x0]\n"
+      "ldr q4, [x28, #0x0]\n"
+      "ldr q5, [x27, #0x0]\n"
       "ldr q6, [x15, #0x0]\n"
       "ldr q7, [x15, #0x10]\n"
       "blt 143f\n"
       "142:"  // Height 6: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x20, [x15, #0x28]\n"
+      "ldr x21, [x15, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x38]\n"
+      "ldr x20, [x15, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
       "ldr d6, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x48]\n"
+      "ldr x21, [x15, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
       "ldr d7, [x15, #0x30]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x15, #0x58]\n"
+      "ldr x20, [x15, #0x58]\n"
       ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr x10, [x12, #0x8]\n"
+      "ldr x26, [x12, #0x8]\n"
       ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x25, [x11, #0x8]\n"
       ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr x24, [x10, #0x8]\n"
       ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
       "ldr d6, [x15, #0x40]\n"
       ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x20, [x15, #0x68]\n"
+      "ldr x21, [x15, #0x68]\n"
       ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr x24, [x25, #0x8]\n"
+      "ldr x23, [x9, #0x8]\n"
       ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
       "sub x13, x13, #0x10\n"
       ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
       "cmp x13, #0x20\n"
       ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
       "ldr d7, [x15, #0x50]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
       ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x78]\n"
+      "ldr x20, [x15, #0x78]\n"
       ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       ".inst 0x4fa5e0dc  // sdot v28.4s, v6.16b, v5.4b[1]\n"
       "ldr d6, [x15, #0x60]\n"
       ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0x88]\n"
+      "ldr x21, [x15, #0x88]\n"
       ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       ".inst 0x4fa5e0fd  // sdot v29.4s, v7.16b, v5.4b[1]\n"
       "ldr d7, [x15, #0x70]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
       ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x15, #0x98]\n"
+      "ldr x20, [x15, #0x98]\n"
       ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
       ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
       ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
       ".inst 0x4fa5e0de  // sdot v30.4s, v6.16b, v5.4b[1]\n"
       "ldr d6, [x15, #0x80]\n"
       ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x20, [x15, #0xa8]\n"
+      "ldr x21, [x15, #0xa8]\n"
       ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
       ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
       ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
       ".inst 0x4fa5e0ff  // sdot v31.4s, v7.16b, v5.4b[1]\n"
       "ldr d7, [x15, #0x90]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
       ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xb8]\n"
+      "ldr x20, [x15, #0xb8]\n"
       ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
       ".inst 0x4f85e8dc  // sdot v28.4s, v6.16b, v5.4b[2]\n"
       "ldr d6, [x15, #0xa0]\n"
       ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xc8]\n"
+      "ldr x21, [x15, #0xc8]\n"
       ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
       ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
       ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
       ".inst 0x4f85e8fd  // sdot v29.4s, v7.16b, v5.4b[2]\n"
       "ldr d7, [x15, #0xb0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
       ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x15, #0xd8]\n"
+      "ldr x20, [x15, #0xd8]\n"
       ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
       ".inst 0x4f85e8de  // sdot v30.4s, v6.16b, v5.4b[2]\n"
       "ldr d6, [x15, #0xc0]\n"
       ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x20, [x15, #0xe8]\n"
+      "ldr x21, [x15, #0xe8]\n"
       ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
       ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
       ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
       ".inst 0x4f85e8ff  // sdot v31.4s, v7.16b, v5.4b[2]\n"
       "ldr d7, [x15, #0xd0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
       ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x15, #0xf8]\n"
+      "ldr x20, [x15, #0xf8]\n"
       ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
       ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
       ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
       ".inst 0x4fa5e8dc  // sdot v28.4s, v6.16b, v5.4b[3]\n"
       "ldr d6, [x15, #0xe0]\n"
       ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x20\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr x22, [x23, #0x8]\n"
+      "ldr x22, [x28, #0x8]\n"
       ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
       ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
       ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
       ".inst 0x4fa5e8fd  // sdot v29.4s, v7.16b, v5.4b[3]\n"
       "ldr d7, [x15, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "add x15, x15, #0x100\n"
       ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
       "ldr x20, [x15, #0x8]\n"
@@ -3085,58 +3084,58 @@
       ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
       "ldr d0, [x12, #0x0]\n"
       ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x11, #0x0]\n"
       ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x10, #0x0]\n"
       ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "ldr d3, [x9, #0x0]\n"
       ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "ldr d4, [x28, #0x0]\n"
       ".inst 0x4fa5e8ff  // sdot v31.4s, v7.16b, v5.4b[3]\n"
-      "ldr d5, [x21, #0x0]\n"
+      "ldr d5, [x27, #0x0]\n"
       "ldr d7, [x15, #0x10]\n"
       "mov v6.d[1], x20\n"
-      "ldr x20, [x21, #0x8]\n"
-      "mov v0.d[1], x10\n"
-      "ldr x11, [x15, #0x18]\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
+      "ldr x21, [x27, #0x8]\n"
+      "mov v0.d[1], x26\n"
+      "ldr x20, [x15, #0x18]\n"
+      "mov v1.d[1], x25\n"
+      "mov v2.d[1], x24\n"
+      "mov v3.d[1], x23\n"
       "mov v4.d[1], x22\n"
-      "mov v5.d[1], x20\n"
-      "mov v7.d[1], x11\n"
+      "mov v5.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "bge 142b\n"
       "143:"  // Height 6: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x12, x12, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
       "ldr q6, [x15, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x13, x13, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
       "ldr q7, [x15, #0x30]\n"
       ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
       ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
@@ -3236,143 +3235,143 @@
       "cmp x13, #0x4\n"
       "blt 146f\n"
       "145:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x12], #0x4\n"
+      "ldr s7, [x12], #0x4\n"
       "sub x13, x13, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s6, [x11], #0x4\n"
       "cmp x13, #0x4\n"
+      "ldr s5, [x10], #0x4\n"
+      "ldr s4, [x9], #0x4\n"
+      "ldr s3, [x28], #0x4\n"
       "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q1, [x15, #0x0]\n"
+      ".inst 0x4f87e028  // sdot v8.4s, v1.16b, v7.4b[0]\n"
+      "ldr q0, [x15, #0x10]\n"
+      ".inst 0x4f86e02c  // sdot v12.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e030  // sdot v16.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e034  // sdot v20.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e038  // sdot v24.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03c  // sdot v28.4s, v1.16b, v2.4b[0]\n"
+      "ldr q1, [x15, #0x20]\n"
+      ".inst 0x4f87e009  // sdot v9.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00d  // sdot v13.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e011  // sdot v17.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e015  // sdot v21.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e019  // sdot v25.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01d  // sdot v29.4s, v0.16b, v2.4b[0]\n"
+      "ldr q0, [x15, #0x30]\n"
+      ".inst 0x4f87e02a  // sdot v10.4s, v1.16b, v7.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f86e02e  // sdot v14.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e032  // sdot v18.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e036  // sdot v22.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e03a  // sdot v26.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03e  // sdot v30.4s, v1.16b, v2.4b[0]\n"
+      ".inst 0x4f87e00b  // sdot v11.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00f  // sdot v15.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e013  // sdot v19.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e017  // sdot v23.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e01b  // sdot v27.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01f  // sdot v31.4s, v0.16b, v2.4b[0]\n"
       "bge 145b\n"
       "146:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x13, 149f\n"
       "tbz x13, #1, 147f\n"
       "ldr h0, [x12], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
-      "ldr h5, [x21], #0x2\n"
+      "ldr h1, [x11], #0x2\n"
+      "ldr h2, [x10], #0x2\n"
+      "ldr h3, [x9], #0x2\n"
+      "ldr h4, [x28], #0x2\n"
+      "ldr h5, [x27], #0x2\n"
       "tbz x13, #0, 148f\n"
       "ld1 { v0.b }[2], [x12]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
-      "ld1 { v4.b }[2], [x23]\n"
-      "ld1 { v5.b }[2], [x21]\n"
+      "ld1 { v1.b }[2], [x11]\n"
+      "ld1 { v2.b }[2], [x10]\n"
+      "ld1 { v3.b }[2], [x9]\n"
+      "ld1 { v4.b }[2], [x28]\n"
+      "ld1 { v5.b }[2], [x27]\n"
       "b 148f\n"
       "147:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x12, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
-      "ldr b4, [x23, #0x0]\n"
-      "ldr b5, [x21, #0x0]\n"
+      "ldr b1, [x11, #0x0]\n"
+      "ldr b2, [x10, #0x0]\n"
+      "ldr b3, [x9, #0x0]\n"
+      "ldr b4, [x28, #0x0]\n"
+      "ldr b5, [x27, #0x0]\n"
       "148:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x15, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x15, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x15, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x15, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x15, #0x0]\n"
+      ".inst 0x4f80e0e8  // sdot v8.4s, v7.16b, v0.4b[0]\n"
+      "ldr q6, [x15, #0x10]\n"
+      ".inst 0x4f81e0ec  // sdot v12.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f0  // sdot v16.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f4  // sdot v20.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f8  // sdot v24.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fc  // sdot v28.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x15, #0x20]\n"
+      ".inst 0x4f80e0c9  // sdot v9.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cd  // sdot v13.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d1  // sdot v17.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d5  // sdot v21.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d9  // sdot v25.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dd  // sdot v29.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x15, #0x30]\n"
+      ".inst 0x4f80e0ea  // sdot v10.4s, v7.16b, v0.4b[0]\n"
       "add x15, x15, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f81e0ee  // sdot v14.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f2  // sdot v18.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f6  // sdot v22.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fa  // sdot v26.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fe  // sdot v30.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0cb  // sdot v11.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cf  // sdot v15.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d3  // sdot v19.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d7  // sdot v23.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0db  // sdot v27.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0df  // sdot v31.4s, v6.16b, v5.4b[0]\n"
       "149:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x14, x14, #0x1\n"
       "cmp x14, x20\n"
       "bne 139b\n"
-      "ldr q0, [x6, #0x0]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "ldr q1, [x6, #0x10]\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x6, #0x20]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "ldr q3, [x6, #0x30]\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q3, [x6, #0x0]\n"
+      "add v8.4s, v8.4s, v3.4s\n"
+      "ldr q2, [x6, #0x10]\n"
+      "add v9.4s, v9.4s, v2.4s\n"
+      "ldr q1, [x6, #0x20]\n"
+      "add v10.4s, v10.4s, v1.4s\n"
+      "ldr q0, [x6, #0x30]\n"
+      "add v11.4s, v11.4s, v0.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x17, x20\n"
+      "add x25, x17, x20\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
       "add x22, x23, x20\n"
       "add x21, x22, x20\n"
-      "add x20, x21, x20\n"
       "prfm pstl1keep, [x17, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
+      "add v12.4s, v12.4s, v3.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "add v13.4s, v13.4s, v2.4s\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
+      "add v14.4s, v14.4s, v1.4s\n"
       "prfm pstl1keep, [x23, #0x0]\n"
-      "add v14.4s, v14.4s, v2.4s\n"
+      "add v15.4s, v15.4s, v0.4s\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "add v15.4s, v15.4s, v3.4s\n"
+      "add v16.4s, v16.4s, v3.4s\n"
       "prfm pstl1keep, [x21, #0x0]\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "add v28.4s, v28.4s, v0.4s\n"
-      "add v29.4s, v29.4s, v1.4s\n"
-      "add v30.4s, v30.4s, v2.4s\n"
-      "add v31.4s, v31.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v2.4s\n"
+      "add v18.4s, v18.4s, v1.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v2.4s\n"
+      "add v22.4s, v22.4s, v1.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v2.4s\n"
+      "add v26.4s, v26.4s, v1.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v2.4s\n"
+      "add v30.4s, v30.4s, v1.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
       "add x6, x6, #0x40\n"
       "tbz %x[flags], #4, 150f\n"
       "ldr q0, [x8, #0x0]\n"
@@ -3387,10 +3386,10 @@
       "add x7, x7, #0x40\n"
       "b 151f\n"
       "150:"  // Height 6: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -3423,78 +3422,78 @@
       "sqrdmulh v30.4s, v30.4s, v6.4s\n"
       "sqrdmulh v31.4s, v31.4s, v7.4s\n"
       "tbz %x[flags], #5, 152f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v7.16b, v8.16b, v0.16b\n"
+      "and v6.16b, v9.16b, v1.16b\n"
+      "and v5.16b, v10.16b, v2.16b\n"
+      "and v4.16b, v11.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v20.16b, v0.16b\n"
-      "and v5.16b, v21.16b, v1.16b\n"
-      "and v6.16b, v22.16b, v2.16b\n"
-      "and v7.16b, v23.16b, v3.16b\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v7.4s\n"
+      "sqadd v9.4s, v9.4s, v6.4s\n"
+      "sqadd v10.4s, v10.4s, v5.4s\n"
+      "sqadd v11.4s, v11.4s, v4.4s\n"
+      "and v7.16b, v12.16b, v0.16b\n"
+      "and v6.16b, v13.16b, v1.16b\n"
+      "and v5.16b, v14.16b, v2.16b\n"
+      "and v4.16b, v15.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v4.4s\n"
-      "sqadd v21.4s, v21.4s, v5.4s\n"
-      "sqadd v22.4s, v22.4s, v6.4s\n"
-      "sqadd v23.4s, v23.4s, v7.4s\n"
-      "and v4.16b, v24.16b, v0.16b\n"
-      "and v5.16b, v25.16b, v1.16b\n"
-      "and v6.16b, v26.16b, v2.16b\n"
-      "and v7.16b, v27.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v7.4s\n"
+      "sqadd v13.4s, v13.4s, v6.4s\n"
+      "sqadd v14.4s, v14.4s, v5.4s\n"
+      "sqadd v15.4s, v15.4s, v4.4s\n"
+      "and v7.16b, v16.16b, v0.16b\n"
+      "and v6.16b, v17.16b, v1.16b\n"
+      "and v5.16b, v18.16b, v2.16b\n"
+      "and v4.16b, v19.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v24.4s, v24.4s, v4.4s\n"
-      "sqadd v25.4s, v25.4s, v5.4s\n"
-      "sqadd v26.4s, v26.4s, v6.4s\n"
-      "sqadd v27.4s, v27.4s, v7.4s\n"
-      "and v4.16b, v28.16b, v0.16b\n"
-      "and v5.16b, v29.16b, v1.16b\n"
-      "and v6.16b, v30.16b, v2.16b\n"
-      "and v7.16b, v31.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v7.4s\n"
+      "sqadd v17.4s, v17.4s, v6.4s\n"
+      "sqadd v18.4s, v18.4s, v5.4s\n"
+      "sqadd v19.4s, v19.4s, v4.4s\n"
+      "and v7.16b, v20.16b, v0.16b\n"
+      "and v6.16b, v21.16b, v1.16b\n"
+      "and v5.16b, v22.16b, v2.16b\n"
+      "and v4.16b, v23.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v28.4s, v28.4s, v4.4s\n"
-      "sqadd v29.4s, v29.4s, v5.4s\n"
-      "sqadd v30.4s, v30.4s, v6.4s\n"
-      "sqadd v31.4s, v31.4s, v7.4s\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v7.4s\n"
+      "sqadd v21.4s, v21.4s, v6.4s\n"
+      "sqadd v22.4s, v22.4s, v5.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v7.16b, v24.16b, v0.16b\n"
+      "and v6.16b, v25.16b, v1.16b\n"
+      "and v5.16b, v26.16b, v2.16b\n"
+      "and v4.16b, v27.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v7.4s\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "sqadd v26.4s, v26.4s, v5.4s\n"
+      "sqadd v27.4s, v27.4s, v4.4s\n"
+      "and v7.16b, v28.16b, v0.16b\n"
+      "and v6.16b, v29.16b, v1.16b\n"
+      "and v5.16b, v30.16b, v2.16b\n"
+      "and v4.16b, v31.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v7.4s\n"
+      "sqadd v29.4s, v29.4s, v6.4s\n"
+      "sqadd v30.4s, v30.4s, v5.4s\n"
+      "sqadd v31.4s, v31.4s, v4.4s\n"
       "152:"  // Height 6: no shift correction
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
@@ -3520,232 +3519,232 @@
       "srshl v29.4s, v29.4s, v1.4s\n"
       "srshl v30.4s, v30.4s, v2.4s\n"
       "srshl v31.4s, v31.4s, v3.4s\n"
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v0.4s\n"
+      "add v9.4s, v9.4s, v0.4s\n"
+      "add v10.4s, v10.4s, v0.4s\n"
+      "add v11.4s, v11.4s, v0.4s\n"
+      "add v12.4s, v12.4s, v0.4s\n"
+      "add v13.4s, v13.4s, v0.4s\n"
+      "add v14.4s, v14.4s, v0.4s\n"
+      "add v15.4s, v15.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v0.4s\n"
+      "add v18.4s, v18.4s, v0.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v0.4s\n"
+      "add v22.4s, v22.4s, v0.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v25.4s, v25.4s, v0.4s\n"
+      "add v26.4s, v26.4s, v0.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v0.4s\n"
+      "add v29.4s, v29.4s, v0.4s\n"
+      "add v30.4s, v30.4s, v0.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "smin v8.4s, v8.4s, v0.4s\n"
+      "smin v9.4s, v9.4s, v0.4s\n"
+      "smin v10.4s, v10.4s, v0.4s\n"
+      "smin v11.4s, v11.4s, v0.4s\n"
+      "smin v12.4s, v12.4s, v0.4s\n"
+      "smin v13.4s, v13.4s, v0.4s\n"
+      "smin v14.4s, v14.4s, v0.4s\n"
+      "smin v15.4s, v15.4s, v0.4s\n"
+      "smin v16.4s, v16.4s, v0.4s\n"
+      "smin v17.4s, v17.4s, v0.4s\n"
+      "smin v18.4s, v18.4s, v0.4s\n"
+      "smin v19.4s, v19.4s, v0.4s\n"
+      "smin v20.4s, v20.4s, v0.4s\n"
+      "smin v21.4s, v21.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v0.4s\n"
+      "smin v23.4s, v23.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v0.4s\n"
+      "smin v25.4s, v25.4s, v0.4s\n"
+      "smin v26.4s, v26.4s, v0.4s\n"
+      "smin v27.4s, v27.4s, v0.4s\n"
+      "smin v28.4s, v28.4s, v0.4s\n"
+      "smin v29.4s, v29.4s, v0.4s\n"
+      "smin v30.4s, v30.4s, v0.4s\n"
+      "smin v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "smax v8.4s, v8.4s, v0.4s\n"
+      "smax v9.4s, v9.4s, v0.4s\n"
+      "smax v10.4s, v10.4s, v0.4s\n"
+      "smax v11.4s, v11.4s, v0.4s\n"
+      "smax v12.4s, v12.4s, v0.4s\n"
+      "smax v13.4s, v13.4s, v0.4s\n"
+      "smax v14.4s, v14.4s, v0.4s\n"
+      "smax v15.4s, v15.4s, v0.4s\n"
+      "smax v16.4s, v16.4s, v0.4s\n"
+      "smax v17.4s, v17.4s, v0.4s\n"
+      "smax v18.4s, v18.4s, v0.4s\n"
+      "smax v19.4s, v19.4s, v0.4s\n"
+      "smax v20.4s, v20.4s, v0.4s\n"
+      "smax v21.4s, v21.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v0.4s\n"
+      "smax v23.4s, v23.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v0.4s\n"
+      "smax v25.4s, v25.4s, v0.4s\n"
+      "smax v26.4s, v26.4s, v0.4s\n"
+      "smax v27.4s, v27.4s, v0.4s\n"
+      "smax v28.4s, v28.4s, v0.4s\n"
+      "smax v29.4s, v29.4s, v0.4s\n"
+      "smax v30.4s, v30.4s, v0.4s\n"
+      "smax v31.4s, v31.4s, v0.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v2.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v1.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v0.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v19.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v18.8h, v26.8h, v27.8h\n"
       "uzp1 v28.8h, v28.8h, v29.8h\n"
-      "uzp1 v29.8h, v30.8h, v31.8h\n"
+      "uzp1 v17.8h, v30.8h, v31.8h\n"
       "cmp x16, #0x10\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
-      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "uzp1 v8.16b, v8.16b, v2.16b\n"
+      "uzp1 v12.16b, v12.16b, v1.16b\n"
+      "uzp1 v16.16b, v16.16b, v0.16b\n"
+      "uzp1 v20.16b, v20.16b, v19.16b\n"
+      "uzp1 v24.16b, v24.16b, v18.16b\n"
+      "uzp1 v28.16b, v28.16b, v17.16b\n"
       "bge 161f\n"
       "tbz x16, #3, 156f\n"
       "str d8, [x17], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "str d28, [x20], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "str d28, [x21], #0x8\n"
       "tbz x16, #2, 154f\n"
       "st1 { v8.s }[2], [x17], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "st1 { v28.s }[2], [x20], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
       "tbz x16, #1, 153f\n"
       "st1 { v8.h }[6], [x17], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "st1 { v28.h }[6], [x20], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[14], [x17]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
-      "st1 { v28.b }[14], [x20]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
+      "st1 { v28.b }[14], [x21]\n"
       "b 160f\n"
       "153:"  // Height 6: Partial direct writeback: partial_1_12
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[12], [x17]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
-      "st1 { v28.b }[12], [x20]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
+      "st1 { v28.b }[12], [x21]\n"
       "b 160f\n"
       "154:"  // Height 6: Partial direct writeback: partial_2_8
       "tbz x16, #1, 155f\n"
       "st1 { v8.h }[4], [x17], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "st1 { v28.h }[4], [x20], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[10], [x17]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
-      "st1 { v28.b }[10], [x20]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
+      "st1 { v28.b }[10], [x21]\n"
       "b 160f\n"
       "155:"  // Height 6: Partial direct writeback: partial_1_8
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[8], [x17]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
-      "st1 { v28.b }[8], [x20]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
+      "st1 { v28.b }[8], [x21]\n"
       "b 160f\n"
       "156:"  // Height 6: Partial direct writeback: partial_4_0
       "tbz x16, #2, 158f\n"
       "str s8, [x17], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "str s28, [x20], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "str s28, [x21], #0x4\n"
       "tbz x16, #1, 157f\n"
       "st1 { v8.h }[2], [x17], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "st1 { v28.h }[2], [x20], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[6], [x17]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
-      "st1 { v28.b }[6], [x20]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
+      "st1 { v28.b }[6], [x21]\n"
       "b 160f\n"
       "157:"  // Height 6: Partial direct writeback: partial_1_4
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[4], [x17]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
-      "st1 { v28.b }[4], [x20]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
+      "st1 { v28.b }[4], [x21]\n"
       "b 160f\n"
       "158:"  // Height 6: Partial direct writeback: partial_2_0
       "tbz x16, #1, 159f\n"
       "str h8, [x17], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "str h28, [x20], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "str h28, [x21], #0x2\n"
       "tbz x16, #0, 160f\n"
       "st1 { v8.b }[2], [x17]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
-      "st1 { v28.b }[2], [x20]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
+      "st1 { v28.b }[2], [x21]\n"
       "b 160f\n"
       "159:"  // Height 6: Partial direct writeback: partial_1_0
       "str b8, [x17, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
-      "str b28, [x20, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
       "160:"  // Height 6: Partial direct writeback: Done
       "b 162f\n"
       "161:"  // Height 6: Full writeback
       "str q8, [x17, #0x0]\n"
       "add x17, x17, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
-      "str q28, [x20, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
       "162:"  // Height 6: Writeback done
       "subs x16, x16, #0x10\n"
       "bgt 137b\n"
@@ -3761,7 +3760,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "164:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index 598d152..f394232 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -85,7 +85,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 136f\n"
@@ -111,11 +110,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -132,37 +131,37 @@
       "blt 8f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q17, [x9, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x9, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x9, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
       "cmp x27, #0x20\n"
       "add x9, x9, #0x100\n"
@@ -172,37 +171,37 @@
       "bge 7b\n"
       "8:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q17, [x9, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x9, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x9, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x9, x9, #0x100\n"
       "9:"  // Height 1: Multiply loop: Main loop skip
@@ -210,17 +209,17 @@
       "cmp x27, #0x4\n"
       "blt 11f\n"
       "10:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr q16, [x9, #0x0]\n"
+      ".inst 0x4f92e208  // sdot v8.4s, v16.16b, v18.4b[0]\n"
       "sub x27, x27, #0x4\n"
-      "ldr q7, [x9, #0x10]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4f92e209  // sdot v9.4s, v16.16b, v18.4b[0]\n"
       "cmp x27, #0x4\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4f92e22a  // sdot v10.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x4f92e20b  // sdot v11.4s, v16.16b, v18.4b[0]\n"
       "add x9, x9, #0x40\n"
       "bge 10b\n"
       "11:"  // Height 1: Multiply loop: Skip odd blocks
@@ -233,28 +232,28 @@
       "12:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x26, #0x0]\n"
       "13:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q17, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x10]\n"
+      ".inst 0x4f80e228  // sdot v8.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      "ldr q17, [x9, #0x20]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "add x9, x9, #0x40\n"
       "14:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 4b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q17, [x14, #0x0]\n"
+      "ldr q16, [x14, #0x10]\n"
+      "add v8.4s, v8.4s, v17.4s\n"
+      "add v9.4s, v9.4s, v16.4s\n"
+      "ldr q17, [x14, #0x20]\n"
+      "ldr q16, [x14, #0x30]\n"
+      "add v10.4s, v10.4s, v17.4s\n"
+      "add v11.4s, v11.4s, v16.4s\n"
       "prfm pstl1keep, [x11, #0x0]\n"
       "add x14, x14, #0x40\n"
       "tbz %x[flags], #4, 15f\n"
@@ -270,10 +269,10 @@
       "add x13, x13, #0x40\n"
       "b 16f\n"
       "15:"  // Height 1: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -286,45 +285,45 @@
       "sqrdmulh v10.4s, v10.4s, v6.4s\n"
       "sqrdmulh v11.4s, v11.4s, v7.4s\n"
       "tbz %x[flags], #5, 17f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v19.16b, v8.16b, v0.16b\n"
+      "and v18.16b, v9.16b, v1.16b\n"
+      "and v17.16b, v10.16b, v2.16b\n"
+      "and v16.16b, v11.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v19.4s\n"
+      "sqadd v9.4s, v9.4s, v18.4s\n"
+      "sqadd v10.4s, v10.4s, v17.4s\n"
+      "sqadd v11.4s, v11.4s, v16.4s\n"
       "17:"  // Height 1: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "add v8.4s, v8.4s, v18.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "add v10.4s, v10.4s, v18.4s\n"
+      "add v11.4s, v11.4s, v18.4s\n"
       "cmp x10, #0x10\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
+      "smin v8.4s, v8.4s, v17.4s\n"
+      "smin v9.4s, v9.4s, v17.4s\n"
+      "smin v10.4s, v10.4s, v17.4s\n"
+      "smin v11.4s, v11.4s, v17.4s\n"
+      "smax v8.4s, v8.4s, v16.4s\n"
+      "smax v9.4s, v9.4s, v16.4s\n"
+      "smax v10.4s, v10.4s, v16.4s\n"
+      "smax v11.4s, v11.4s, v16.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v16.8h, v10.8h, v11.8h\n"
+      "uzp1 v8.16b, v8.16b, v16.16b\n"
       "bge 26f\n"
       "tbz x10, #3, 21f\n"
       "str d8, [x11], #0x8\n"
@@ -399,12 +398,12 @@
       "31:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 32f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -412,7 +411,7 @@
       "b 33f\n"
       "32:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "33:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "blt 36f\n"
@@ -425,137 +424,137 @@
       "34:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q17, [x9, #0x20]\n"
       "sub x27, x27, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q16, [x9, #0x30]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x9, #0x40]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x9, #0x50]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x9, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x9, #0x70]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
       "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x9, #0x10]\n"
       "bge 34b\n"
       "35:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q17, [x9, #0x20]\n"
       "add x26, x26, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q16, [x9, #0x30]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x9, #0x40]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x9, #0x50]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x9, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x9, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
       "36:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 41f\n"
       "cmp x27, #0x4\n"
       "blt 38f\n"
       "37:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q17, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x10]\n"
+      ".inst 0x4f93e228  // sdot v8.4s, v17.16b, v19.4b[0]\n"
+      ".inst 0x4f92e22c  // sdot v12.4s, v17.16b, v18.4b[0]\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4f93e209  // sdot v9.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20d  // sdot v13.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4f93e22a  // sdot v10.4s, v17.16b, v19.4b[0]\n"
+      ".inst 0x4f92e22e  // sdot v14.4s, v17.16b, v18.4b[0]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f93e20b  // sdot v11.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20f  // sdot v15.4s, v16.16b, v18.4b[0]\n"
       "bge 37b\n"
       "38:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x27, 41f\n"
@@ -570,41 +569,41 @@
       "ldr b0, [x26, #0x0]\n"
       "ldr b1, [x25, #0x0]\n"
       "40:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q17, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x10]\n"
+      ".inst 0x4f80e228  // sdot v8.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22c  // sdot v12.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20d  // sdot v13.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
       "41:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 31b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q19, [x14, #0x0]\n"
+      "ldr q18, [x14, #0x10]\n"
+      "add v8.4s, v8.4s, v19.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "ldr q17, [x14, #0x20]\n"
+      "ldr q16, [x14, #0x30]\n"
+      "add v10.4s, v10.4s, v17.4s\n"
+      "add v11.4s, v11.4s, v16.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
+      "add x25, x11, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
+      "add v12.4s, v12.4s, v19.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "add v13.4s, v13.4s, v18.4s\n"
+      "add v14.4s, v14.4s, v17.4s\n"
       "add x14, x14, #0x40\n"
-      "add v15.4s, v15.4s, v3.4s\n"
+      "add v15.4s, v15.4s, v16.4s\n"
       "tbz %x[flags], #4, 42f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -618,10 +617,10 @@
       "add x13, x13, #0x40\n"
       "b 43f\n"
       "42:"  // Height 2: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -638,141 +637,141 @@
       "sqrdmulh v14.4s, v14.4s, v6.4s\n"
       "sqrdmulh v15.4s, v15.4s, v7.4s\n"
       "tbz %x[flags], #5, 44f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "and v19.16b, v8.16b, v0.16b\n"
+      "and v18.16b, v9.16b, v1.16b\n"
+      "and v17.16b, v10.16b, v2.16b\n"
+      "and v16.16b, v11.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v19.4s\n"
+      "sqadd v9.4s, v9.4s, v18.4s\n"
+      "sqadd v10.4s, v10.4s, v17.4s\n"
+      "sqadd v11.4s, v11.4s, v16.4s\n"
+      "and v19.16b, v12.16b, v0.16b\n"
+      "and v18.16b, v13.16b, v1.16b\n"
+      "and v17.16b, v14.16b, v2.16b\n"
+      "and v16.16b, v15.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v19.4s\n"
+      "sqadd v13.4s, v13.4s, v18.4s\n"
+      "sqadd v14.4s, v14.4s, v17.4s\n"
+      "sqadd v15.4s, v15.4s, v16.4s\n"
       "44:"  // Height 2: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "srshl v12.4s, v12.4s, v0.4s\n"
       "srshl v13.4s, v13.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
       "srshl v14.4s, v14.4s, v2.4s\n"
       "srshl v15.4s, v15.4s, v3.4s\n"
       "cmp x10, #0x10\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
+      "add v8.4s, v8.4s, v18.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "add v10.4s, v10.4s, v18.4s\n"
+      "add v11.4s, v11.4s, v18.4s\n"
+      "add v12.4s, v12.4s, v18.4s\n"
+      "add v13.4s, v13.4s, v18.4s\n"
+      "add v14.4s, v14.4s, v18.4s\n"
+      "add v15.4s, v15.4s, v18.4s\n"
+      "smin v8.4s, v8.4s, v17.4s\n"
+      "smin v9.4s, v9.4s, v17.4s\n"
+      "smin v10.4s, v10.4s, v17.4s\n"
+      "smin v11.4s, v11.4s, v17.4s\n"
+      "smin v12.4s, v12.4s, v17.4s\n"
+      "smin v13.4s, v13.4s, v17.4s\n"
+      "smin v14.4s, v14.4s, v17.4s\n"
+      "smin v15.4s, v15.4s, v17.4s\n"
+      "smax v8.4s, v8.4s, v16.4s\n"
+      "smax v9.4s, v9.4s, v16.4s\n"
+      "smax v10.4s, v10.4s, v16.4s\n"
+      "smax v11.4s, v11.4s, v16.4s\n"
+      "smax v12.4s, v12.4s, v16.4s\n"
+      "smax v13.4s, v13.4s, v16.4s\n"
+      "smax v14.4s, v14.4s, v16.4s\n"
+      "smax v15.4s, v15.4s, v16.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v17.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v16.8h, v14.8h, v15.8h\n"
+      "uzp1 v8.16b, v8.16b, v17.16b\n"
+      "uzp1 v12.16b, v12.16b, v16.16b\n"
       "bge 53f\n"
       "tbz x10, #3, 48f\n"
       "str d8, [x11], #0x8\n"
-      "str d12, [x24], #0x8\n"
+      "str d12, [x25], #0x8\n"
       "tbz x10, #2, 46f\n"
       "st1 { v8.s }[2], [x11], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
       "tbz x10, #1, 45f\n"
       "st1 { v8.h }[6], [x11], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[14], [x11]\n"
-      "st1 { v12.b }[14], [x24]\n"
+      "st1 { v12.b }[14], [x25]\n"
       "b 52f\n"
       "45:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[12], [x11]\n"
-      "st1 { v12.b }[12], [x24]\n"
+      "st1 { v12.b }[12], [x25]\n"
       "b 52f\n"
       "46:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x10, #1, 47f\n"
       "st1 { v8.h }[4], [x11], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[10], [x11]\n"
-      "st1 { v12.b }[10], [x24]\n"
+      "st1 { v12.b }[10], [x25]\n"
       "b 52f\n"
       "47:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[8], [x11]\n"
-      "st1 { v12.b }[8], [x24]\n"
+      "st1 { v12.b }[8], [x25]\n"
       "b 52f\n"
       "48:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x10, #2, 50f\n"
       "str s8, [x11], #0x4\n"
-      "str s12, [x24], #0x4\n"
+      "str s12, [x25], #0x4\n"
       "tbz x10, #1, 49f\n"
       "st1 { v8.h }[2], [x11], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[6], [x11]\n"
-      "st1 { v12.b }[6], [x24]\n"
+      "st1 { v12.b }[6], [x25]\n"
       "b 52f\n"
       "49:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[4], [x11]\n"
-      "st1 { v12.b }[4], [x24]\n"
+      "st1 { v12.b }[4], [x25]\n"
       "b 52f\n"
       "50:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x10, #1, 51f\n"
       "str h8, [x11], #0x2\n"
-      "str h12, [x24], #0x2\n"
+      "str h12, [x25], #0x2\n"
       "tbz x10, #0, 52f\n"
       "st1 { v8.b }[2], [x11]\n"
-      "st1 { v12.b }[2], [x24]\n"
+      "st1 { v12.b }[2], [x25]\n"
       "b 52f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_0
       "str b8, [x11, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
       "52:"  // Height 2: Partial direct writeback: Done
       "b 54f\n"
       "53:"  // Height 2: Full writeback
       "str q8, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q12, [x24, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
       "54:"  // Height 2: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 29b\n"
@@ -802,13 +801,13 @@
       "58:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -817,8 +816,8 @@
       "b 60f\n"
       "59:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "60:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "blt 63f\n"
@@ -835,75 +834,75 @@
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q21, [x9, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q20, [x9, #0x30]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
       "cmp x27, #0x20\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x9, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x9, #0x50]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x9, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x9, #0x70]\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x9, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x9, #0x90]\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x9, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x9, #0xb0]\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x9, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x9, #0xd0]\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x9, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
       "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x9, #0x10]\n"
       "bge 61b\n"
@@ -913,98 +912,98 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q21, [x9, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q20, [x9, #0x30]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x9, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x9, #0x50]\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x9, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x9, #0x70]\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x9, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x9, #0x90]\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x9, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x9, #0xb0]\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x9, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x9, #0xd0]\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x9, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
       "63:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 68f\n"
       "cmp x27, #0x4\n"
       "blt 65f\n"
       "64:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr q21, [x9, #0x0]\n"
+      ".inst 0x4f98e2a8  // sdot v8.4s, v21.16b, v24.4b[0]\n"
+      ".inst 0x4f97e2ac  // sdot v12.4s, v21.16b, v23.4b[0]\n"
+      "ldr q20, [x9, #0x10]\n"
+      ".inst 0x4f96e2b0  // sdot v16.4s, v21.16b, v22.4b[0]\n"
+      "ldr q21, [x9, #0x20]\n"
+      ".inst 0x4f98e289  // sdot v9.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28d  // sdot v13.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e291  // sdot v17.4s, v20.16b, v22.4b[0]\n"
+      "ldr q20, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f98e2aa  // sdot v10.4s, v21.16b, v24.4b[0]\n"
+      ".inst 0x4f97e2ae  // sdot v14.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x4f96e2b2  // sdot v18.4s, v21.16b, v22.4b[0]\n"
+      ".inst 0x4f98e28b  // sdot v11.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28f  // sdot v15.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e293  // sdot v19.4s, v20.16b, v22.4b[0]\n"
       "bge 64b\n"
       "65:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 68f\n"
@@ -1022,51 +1021,51 @@
       "ldr b1, [x25, #0x0]\n"
       "ldr b2, [x24, #0x0]\n"
       "67:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q21, [x9, #0x0]\n"
+      "ldr q20, [x9, #0x10]\n"
+      ".inst 0x4f80e2a8  // sdot v8.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ac  // sdot v12.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b0  // sdot v16.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x9, #0x20]\n"
+      ".inst 0x4f80e289  // sdot v9.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28d  // sdot v13.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e291  // sdot v17.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
       "68:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 58b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q23, [x14, #0x0]\n"
+      "ldr q22, [x14, #0x10]\n"
+      "add v8.4s, v8.4s, v23.4s\n"
+      "add v9.4s, v9.4s, v22.4s\n"
+      "ldr q21, [x14, #0x20]\n"
+      "ldr q20, [x14, #0x30]\n"
+      "add v10.4s, v10.4s, v21.4s\n"
+      "add v11.4s, v11.4s, v20.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x11, x20\n"
+      "add x24, x25, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
-      "add v15.4s, v15.4s, v3.4s\n"
+      "add v12.4s, v12.4s, v23.4s\n"
+      "add v13.4s, v13.4s, v22.4s\n"
+      "add v14.4s, v14.4s, v21.4s\n"
+      "add v15.4s, v15.4s, v20.4s\n"
       "add x14, x14, #0x40\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v16.4s, v16.4s, v23.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
       "tbz %x[flags], #4, 69f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -1080,10 +1079,10 @@
       "add x13, x13, #0x40\n"
       "b 70f\n"
       "69:"  // Height 3: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -1104,55 +1103,55 @@
       "sqrdmulh v18.4s, v18.4s, v6.4s\n"
       "sqrdmulh v19.4s, v19.4s, v7.4s\n"
       "tbz %x[flags], #5, 71f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v23.16b, v8.16b, v0.16b\n"
+      "and v22.16b, v9.16b, v1.16b\n"
+      "and v21.16b, v10.16b, v2.16b\n"
+      "and v20.16b, v11.16b, v3.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v23.4s\n"
+      "sqadd v9.4s, v9.4s, v22.4s\n"
+      "sqadd v10.4s, v10.4s, v21.4s\n"
+      "sqadd v11.4s, v11.4s, v20.4s\n"
+      "and v23.16b, v12.16b, v0.16b\n"
+      "and v22.16b, v13.16b, v1.16b\n"
+      "and v21.16b, v14.16b, v2.16b\n"
+      "and v20.16b, v15.16b, v3.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v23.4s\n"
+      "sqadd v13.4s, v13.4s, v22.4s\n"
+      "sqadd v14.4s, v14.4s, v21.4s\n"
+      "sqadd v15.4s, v15.4s, v20.4s\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v1.16b\n"
+      "and v21.16b, v18.16b, v2.16b\n"
+      "and v20.16b, v19.16b, v3.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "71:"  // Height 3: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "srshl v12.4s, v12.4s, v0.4s\n"
       "srshl v13.4s, v13.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "srshl v14.4s, v14.4s, v2.4s\n"
       "srshl v15.4s, v15.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -1160,132 +1159,132 @@
       "srshl v17.4s, v17.4s, v1.4s\n"
       "srshl v18.4s, v18.4s, v2.4s\n"
       "srshl v19.4s, v19.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v8.4s, v8.4s, v22.4s\n"
+      "add v9.4s, v9.4s, v22.4s\n"
+      "add v10.4s, v10.4s, v22.4s\n"
+      "add v11.4s, v11.4s, v22.4s\n"
+      "add v12.4s, v12.4s, v22.4s\n"
+      "add v13.4s, v13.4s, v22.4s\n"
+      "add v14.4s, v14.4s, v22.4s\n"
+      "add v15.4s, v15.4s, v22.4s\n"
+      "add v16.4s, v16.4s, v22.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v22.4s\n"
+      "add v19.4s, v19.4s, v22.4s\n"
+      "smin v8.4s, v8.4s, v21.4s\n"
+      "smin v9.4s, v9.4s, v21.4s\n"
+      "smin v10.4s, v10.4s, v21.4s\n"
+      "smin v11.4s, v11.4s, v21.4s\n"
+      "smin v12.4s, v12.4s, v21.4s\n"
+      "smin v13.4s, v13.4s, v21.4s\n"
+      "smin v14.4s, v14.4s, v21.4s\n"
+      "smin v15.4s, v15.4s, v21.4s\n"
+      "smin v16.4s, v16.4s, v21.4s\n"
+      "smin v17.4s, v17.4s, v21.4s\n"
+      "smin v18.4s, v18.4s, v21.4s\n"
+      "smin v19.4s, v19.4s, v21.4s\n"
+      "smax v8.4s, v8.4s, v20.4s\n"
+      "smax v9.4s, v9.4s, v20.4s\n"
+      "smax v10.4s, v10.4s, v20.4s\n"
+      "smax v11.4s, v11.4s, v20.4s\n"
+      "smax v12.4s, v12.4s, v20.4s\n"
+      "smax v13.4s, v13.4s, v20.4s\n"
+      "smax v14.4s, v14.4s, v20.4s\n"
+      "smax v15.4s, v15.4s, v20.4s\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v21.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v20.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v8.16b, v8.16b, v21.16b\n"
+      "uzp1 v12.16b, v12.16b, v20.16b\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
       "bge 80f\n"
       "tbz x10, #3, 75f\n"
       "str d8, [x11], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
       "tbz x10, #2, 73f\n"
       "st1 { v8.s }[2], [x11], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
       "tbz x10, #1, 72f\n"
       "st1 { v8.h }[6], [x11], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[14], [x11]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
       "b 79f\n"
       "72:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[12], [x11]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
       "b 79f\n"
       "73:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x10, #1, 74f\n"
       "st1 { v8.h }[4], [x11], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[10], [x11]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
       "b 79f\n"
       "74:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[8], [x11]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
       "b 79f\n"
       "75:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x10, #2, 77f\n"
       "str s8, [x11], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
       "tbz x10, #1, 76f\n"
       "st1 { v8.h }[2], [x11], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[6], [x11]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
       "b 79f\n"
       "76:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[4], [x11]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
       "b 79f\n"
       "77:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x10, #1, 78f\n"
       "str h8, [x11], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
       "tbz x10, #0, 79f\n"
       "st1 { v8.b }[2], [x11]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
       "b 79f\n"
       "78:"  // Height 3: Partial direct writeback: partial_1_0
       "str b8, [x11, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
       "79:"  // Height 3: Partial direct writeback: Done
       "b 81f\n"
       "80:"  // Height 3: Full writeback
       "str q8, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
       "81:"  // Height 3: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 56b\n"
@@ -1319,14 +1318,14 @@
       "85:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 86f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 87f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1336,9 +1335,9 @@
       "b 87f\n"
       "86:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "87:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "blt 90f\n"
@@ -1357,7 +1356,7 @@
       "add x26, x26, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q25, [x9, #0x20]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1365,85 +1364,85 @@
       "add x23, x23, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q24, [x9, #0x30]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x9, #0x40]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x9, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x9, #0x90]\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x9, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x9, #0xb0]\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x9, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x9, #0xd0]\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x9, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
       "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x9, #0x10]\n"
       "bge 88b\n"
@@ -1454,7 +1453,7 @@
       "add x25, x25, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q25, [x9, #0x20]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1462,112 +1461,112 @@
       "sub x27, x27, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q24, [x9, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x9, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x9, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x9, #0x90]\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x9, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x9, #0xb0]\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x9, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x9, #0xd0]\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x9, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
       "90:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 95f\n"
       "cmp x27, #0x4\n"
       "blt 92f\n"
       "91:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr q25, [x9, #0x0]\n"
+      "ldr q24, [x9, #0x10]\n"
+      ".inst 0x4f9de328  // sdot v8.4s, v25.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce32c  // sdot v12.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be330  // sdot v16.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae334  // sdot v20.4s, v25.16b, v26.4b[0]\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4f9de309  // sdot v9.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30d  // sdot v13.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be311  // sdot v17.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae315  // sdot v21.4s, v24.16b, v26.4b[0]\n"
+      "ldr q24, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f9de32a  // sdot v10.4s, v25.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce32e  // sdot v14.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be332  // sdot v18.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae336  // sdot v22.4s, v25.16b, v26.4b[0]\n"
+      ".inst 0x4f9de30b  // sdot v11.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30f  // sdot v15.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be313  // sdot v19.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae317  // sdot v23.4s, v24.16b, v26.4b[0]\n"
       "bge 91b\n"
       "92:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 95f\n"
@@ -1588,61 +1587,61 @@
       "ldr b2, [x24, #0x0]\n"
       "ldr b3, [x23, #0x0]\n"
       "94:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q25, [x9, #0x0]\n"
+      "ldr q24, [x9, #0x10]\n"
+      ".inst 0x4f80e328  // sdot v8.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32c  // sdot v12.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e330  // sdot v16.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e334  // sdot v20.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4f80e309  // sdot v9.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30d  // sdot v13.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e311  // sdot v17.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e315  // sdot v21.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
       "95:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 85b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q27, [x14, #0x0]\n"
+      "ldr q26, [x14, #0x10]\n"
+      "add v8.4s, v8.4s, v27.4s\n"
+      "add v9.4s, v9.4s, v26.4s\n"
+      "ldr q25, [x14, #0x20]\n"
+      "ldr q24, [x14, #0x30]\n"
+      "add v10.4s, v10.4s, v25.4s\n"
+      "add v11.4s, v11.4s, v24.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x11, x20\n"
+      "add x24, x25, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x20\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
+      "add v12.4s, v12.4s, v27.4s\n"
       "prfm pstl1keep, [x23, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
+      "add v13.4s, v13.4s, v26.4s\n"
+      "add v14.4s, v14.4s, v25.4s\n"
       "add x14, x14, #0x40\n"
-      "add v15.4s, v15.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
+      "add v15.4s, v15.4s, v24.4s\n"
+      "add v16.4s, v16.4s, v27.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v25.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v25.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
       "tbz %x[flags], #4, 96f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -1656,10 +1655,10 @@
       "add x13, x13, #0x40\n"
       "b 97f\n"
       "96:"  // Height 4: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -1684,67 +1683,67 @@
       "sqrdmulh v22.4s, v22.4s, v6.4s\n"
       "sqrdmulh v23.4s, v23.4s, v7.4s\n"
       "tbz %x[flags], #5, 98f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v20.16b, v0.16b\n"
-      "and v5.16b, v21.16b, v1.16b\n"
-      "and v6.16b, v22.16b, v2.16b\n"
-      "and v7.16b, v23.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v4.4s\n"
-      "sqadd v21.4s, v21.4s, v5.4s\n"
-      "sqadd v22.4s, v22.4s, v6.4s\n"
-      "sqadd v23.4s, v23.4s, v7.4s\n"
+      "and v27.16b, v8.16b, v0.16b\n"
+      "and v26.16b, v9.16b, v1.16b\n"
+      "and v25.16b, v10.16b, v2.16b\n"
+      "and v24.16b, v11.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v27.4s\n"
+      "sqadd v9.4s, v9.4s, v26.4s\n"
+      "sqadd v10.4s, v10.4s, v25.4s\n"
+      "sqadd v11.4s, v11.4s, v24.4s\n"
+      "and v27.16b, v12.16b, v0.16b\n"
+      "and v26.16b, v13.16b, v1.16b\n"
+      "and v25.16b, v14.16b, v2.16b\n"
+      "and v24.16b, v15.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v27.4s\n"
+      "sqadd v13.4s, v13.4s, v26.4s\n"
+      "sqadd v14.4s, v14.4s, v25.4s\n"
+      "sqadd v15.4s, v15.4s, v24.4s\n"
+      "and v27.16b, v16.16b, v0.16b\n"
+      "and v26.16b, v17.16b, v1.16b\n"
+      "and v25.16b, v18.16b, v2.16b\n"
+      "and v24.16b, v19.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v27.4s\n"
+      "sqadd v17.4s, v17.4s, v26.4s\n"
+      "sqadd v18.4s, v18.4s, v25.4s\n"
+      "sqadd v19.4s, v19.4s, v24.4s\n"
+      "and v27.16b, v20.16b, v0.16b\n"
+      "and v26.16b, v21.16b, v1.16b\n"
+      "and v25.16b, v22.16b, v2.16b\n"
+      "and v24.16b, v23.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v27.4s\n"
+      "sqadd v21.4s, v21.4s, v26.4s\n"
+      "sqadd v22.4s, v22.4s, v25.4s\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
       "98:"  // Height 4: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "srshl v12.4s, v12.4s, v0.4s\n"
       "srshl v13.4s, v13.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "srshl v14.4s, v14.4s, v2.4s\n"
       "srshl v15.4s, v15.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -1756,163 +1755,163 @@
       "srshl v21.4s, v21.4s, v1.4s\n"
       "srshl v22.4s, v22.4s, v2.4s\n"
       "srshl v23.4s, v23.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v8.4s, v8.4s, v26.4s\n"
+      "add v9.4s, v9.4s, v26.4s\n"
+      "add v10.4s, v10.4s, v26.4s\n"
+      "add v11.4s, v11.4s, v26.4s\n"
+      "add v12.4s, v12.4s, v26.4s\n"
+      "add v13.4s, v13.4s, v26.4s\n"
+      "add v14.4s, v14.4s, v26.4s\n"
+      "add v15.4s, v15.4s, v26.4s\n"
+      "add v16.4s, v16.4s, v26.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v26.4s\n"
+      "add v20.4s, v20.4s, v26.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v23.4s, v23.4s, v26.4s\n"
+      "smin v8.4s, v8.4s, v25.4s\n"
+      "smin v9.4s, v9.4s, v25.4s\n"
+      "smin v10.4s, v10.4s, v25.4s\n"
+      "smin v11.4s, v11.4s, v25.4s\n"
+      "smin v12.4s, v12.4s, v25.4s\n"
+      "smin v13.4s, v13.4s, v25.4s\n"
+      "smin v14.4s, v14.4s, v25.4s\n"
+      "smin v15.4s, v15.4s, v25.4s\n"
+      "smin v16.4s, v16.4s, v25.4s\n"
+      "smin v17.4s, v17.4s, v25.4s\n"
+      "smin v18.4s, v18.4s, v25.4s\n"
+      "smin v19.4s, v19.4s, v25.4s\n"
+      "smin v20.4s, v20.4s, v25.4s\n"
+      "smin v21.4s, v21.4s, v25.4s\n"
+      "smin v22.4s, v22.4s, v25.4s\n"
+      "smin v23.4s, v23.4s, v25.4s\n"
+      "smax v8.4s, v8.4s, v24.4s\n"
+      "smax v9.4s, v9.4s, v24.4s\n"
+      "smax v10.4s, v10.4s, v24.4s\n"
+      "smax v11.4s, v11.4s, v24.4s\n"
+      "smax v12.4s, v12.4s, v24.4s\n"
+      "smax v13.4s, v13.4s, v24.4s\n"
+      "smax v14.4s, v14.4s, v24.4s\n"
+      "smax v15.4s, v15.4s, v24.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v25.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v24.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v17.8h, v22.8h, v23.8h\n"
+      "uzp1 v8.16b, v8.16b, v25.16b\n"
+      "uzp1 v12.16b, v12.16b, v24.16b\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v20.16b, v20.16b, v17.16b\n"
       "bge 107f\n"
       "tbz x10, #3, 102f\n"
       "str d8, [x11], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
-      "str d20, [x22], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
+      "str d20, [x23], #0x8\n"
       "tbz x10, #2, 100f\n"
       "st1 { v8.s }[2], [x11], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
       "tbz x10, #1, 99f\n"
       "st1 { v8.h }[6], [x11], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[14], [x11]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
-      "st1 { v20.b }[14], [x22]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
+      "st1 { v20.b }[14], [x23]\n"
       "b 106f\n"
       "99:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[12], [x11]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
-      "st1 { v20.b }[12], [x22]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
+      "st1 { v20.b }[12], [x23]\n"
       "b 106f\n"
       "100:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x10, #1, 101f\n"
       "st1 { v8.h }[4], [x11], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[10], [x11]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
-      "st1 { v20.b }[10], [x22]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
+      "st1 { v20.b }[10], [x23]\n"
       "b 106f\n"
       "101:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[8], [x11]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
-      "st1 { v20.b }[8], [x22]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
+      "st1 { v20.b }[8], [x23]\n"
       "b 106f\n"
       "102:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x10, #2, 104f\n"
       "str s8, [x11], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
-      "str s20, [x22], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
+      "str s20, [x23], #0x4\n"
       "tbz x10, #1, 103f\n"
       "st1 { v8.h }[2], [x11], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[6], [x11]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
-      "st1 { v20.b }[6], [x22]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
+      "st1 { v20.b }[6], [x23]\n"
       "b 106f\n"
       "103:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[4], [x11]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
-      "st1 { v20.b }[4], [x22]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
+      "st1 { v20.b }[4], [x23]\n"
       "b 106f\n"
       "104:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x10, #1, 105f\n"
       "str h8, [x11], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
-      "str h20, [x22], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
+      "str h20, [x23], #0x2\n"
       "tbz x10, #0, 106f\n"
       "st1 { v8.b }[2], [x11]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
-      "st1 { v20.b }[2], [x22]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
+      "st1 { v20.b }[2], [x23]\n"
       "b 106f\n"
       "105:"  // Height 4: Partial direct writeback: partial_1_0
       "str b8, [x11, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
       "106:"  // Height 4: Partial direct writeback: Done
       "b 108f\n"
       "107:"  // Height 4: Full writeback
       "str q8, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
-      "str q20, [x22, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
       "108:"  // Height 4: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 83b\n"
@@ -1950,15 +1949,15 @@
       "112:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 113f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 114f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1969,10 +1968,10 @@
       "b 114f\n"
       "113:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "114:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "blt 117f\n"
@@ -1995,7 +1994,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q29, [x9, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2004,100 +2003,100 @@
       "cmp x27, #0x20\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q28, [x9, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x9, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x9, #0x50]\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x9, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x9, #0x70]\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x9, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x9, #0x90]\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x9, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x9, #0xb0]\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x9, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x9, #0xd0]\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x9, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
       "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
       "ldr q3, [x23, #0x0]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x9, #0x10]\n"
       "bge 115b\n"
@@ -2111,7 +2110,7 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
+      "ldr q29, [x9, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2120,131 +2119,131 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q28, [x9, #0x30]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x9, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x9, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x9, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x9, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x9, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x9, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x9, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x9, #0xf0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x9, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x9, #0x50]\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x9, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x9, #0x70]\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x9, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x9, #0x90]\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x9, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x9, #0xb0]\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x9, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x9, #0xd0]\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x9, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
       "117:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 122f\n"
       "cmp x27, #0x4\n"
       "blt 119f\n"
       "118:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
       "ldr s1, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr q6, [x9, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr s0, [x24], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q29, [x9, #0x0]\n"
+      ".inst 0x4f82e3a8  // sdot v8.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      "ldr q28, [x9, #0x10]\n"
+      ".inst 0x4f80e3b0  // sdot v16.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b4  // sdot v20.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3b8  // sdot v24.4s, v29.16b, v30.4b[0]\n"
+      "ldr q29, [x9, #0x20]\n"
+      ".inst 0x4f82e389  // sdot v9.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e391  // sdot v17.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe395  // sdot v21.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee399  // sdot v25.4s, v28.16b, v30.4b[0]\n"
+      "ldr q28, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f82e3aa  // sdot v10.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b6  // sdot v22.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3ba  // sdot v26.4s, v29.16b, v30.4b[0]\n"
+      ".inst 0x4f82e38b  // sdot v11.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe397  // sdot v23.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee39b  // sdot v27.4s, v28.16b, v30.4b[0]\n"
       "bge 118b\n"
       "119:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 122f\n"
@@ -2268,71 +2267,71 @@
       "ldr b3, [x23, #0x0]\n"
       "ldr b4, [x22, #0x0]\n"
       "121:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q29, [x9, #0x0]\n"
+      "ldr q28, [x9, #0x10]\n"
+      ".inst 0x4f80e3a8  // sdot v8.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b0  // sdot v16.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b4  // sdot v20.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3b8  // sdot v24.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x9, #0x20]\n"
+      ".inst 0x4f80e389  // sdot v9.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e391  // sdot v17.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e395  // sdot v21.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e399  // sdot v25.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
       "122:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 112b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q31, [x14, #0x0]\n"
+      "ldr q30, [x14, #0x10]\n"
+      "add v8.4s, v8.4s, v31.4s\n"
+      "add v9.4s, v9.4s, v30.4s\n"
+      "ldr q29, [x14, #0x20]\n"
+      "ldr q28, [x14, #0x30]\n"
+      "add v10.4s, v10.4s, v29.4s\n"
+      "add v11.4s, v11.4s, v28.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x11, x20\n"
+      "add x24, x25, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
+      "add x23, x24, x20\n"
       "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
       "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
-      "add v15.4s, v15.4s, v3.4s\n"
+      "add v12.4s, v12.4s, v31.4s\n"
+      "add v13.4s, v13.4s, v30.4s\n"
+      "add v14.4s, v14.4s, v29.4s\n"
+      "add v15.4s, v15.4s, v28.4s\n"
       "add x14, x14, #0x40\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
+      "add v16.4s, v16.4s, v31.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v31.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v31.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
       "tbz %x[flags], #4, 123f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -2346,10 +2345,10 @@
       "add x13, x13, #0x40\n"
       "b 124f\n"
       "123:"  // Height 5: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -2378,79 +2377,79 @@
       "sqrdmulh v26.4s, v26.4s, v6.4s\n"
       "sqrdmulh v27.4s, v27.4s, v7.4s\n"
       "tbz %x[flags], #5, 125f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v20.16b, v0.16b\n"
-      "and v5.16b, v21.16b, v1.16b\n"
-      "and v6.16b, v22.16b, v2.16b\n"
-      "and v7.16b, v23.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v4.4s\n"
-      "sqadd v21.4s, v21.4s, v5.4s\n"
-      "sqadd v22.4s, v22.4s, v6.4s\n"
-      "sqadd v23.4s, v23.4s, v7.4s\n"
-      "and v4.16b, v24.16b, v0.16b\n"
-      "and v5.16b, v25.16b, v1.16b\n"
-      "and v6.16b, v26.16b, v2.16b\n"
-      "and v7.16b, v27.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v24.4s, v24.4s, v4.4s\n"
-      "sqadd v25.4s, v25.4s, v5.4s\n"
-      "sqadd v26.4s, v26.4s, v6.4s\n"
-      "sqadd v27.4s, v27.4s, v7.4s\n"
+      "and v31.16b, v8.16b, v0.16b\n"
+      "and v30.16b, v9.16b, v1.16b\n"
+      "and v29.16b, v10.16b, v2.16b\n"
+      "and v28.16b, v11.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v31.4s\n"
+      "sqadd v9.4s, v9.4s, v30.4s\n"
+      "sqadd v10.4s, v10.4s, v29.4s\n"
+      "sqadd v11.4s, v11.4s, v28.4s\n"
+      "and v31.16b, v12.16b, v0.16b\n"
+      "and v30.16b, v13.16b, v1.16b\n"
+      "and v29.16b, v14.16b, v2.16b\n"
+      "and v28.16b, v15.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v31.4s\n"
+      "sqadd v13.4s, v13.4s, v30.4s\n"
+      "sqadd v14.4s, v14.4s, v29.4s\n"
+      "sqadd v15.4s, v15.4s, v28.4s\n"
+      "and v31.16b, v16.16b, v0.16b\n"
+      "and v30.16b, v17.16b, v1.16b\n"
+      "and v29.16b, v18.16b, v2.16b\n"
+      "and v28.16b, v19.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v31.4s\n"
+      "sqadd v17.4s, v17.4s, v30.4s\n"
+      "sqadd v18.4s, v18.4s, v29.4s\n"
+      "sqadd v19.4s, v19.4s, v28.4s\n"
+      "and v31.16b, v20.16b, v0.16b\n"
+      "and v30.16b, v21.16b, v1.16b\n"
+      "and v29.16b, v22.16b, v2.16b\n"
+      "and v28.16b, v23.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v31.4s\n"
+      "sqadd v21.4s, v21.4s, v30.4s\n"
+      "sqadd v22.4s, v22.4s, v29.4s\n"
+      "sqadd v23.4s, v23.4s, v28.4s\n"
+      "and v31.16b, v24.16b, v0.16b\n"
+      "and v30.16b, v25.16b, v1.16b\n"
+      "and v29.16b, v26.16b, v2.16b\n"
+      "and v28.16b, v27.16b, v3.16b\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v31.4s\n"
+      "sqadd v25.4s, v25.4s, v30.4s\n"
+      "sqadd v26.4s, v26.4s, v29.4s\n"
+      "sqadd v27.4s, v27.4s, v28.4s\n"
       "125:"  // Height 5: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v30.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "srshl v12.4s, v12.4s, v0.4s\n"
       "srshl v13.4s, v13.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "srshl v14.4s, v14.4s, v2.4s\n"
       "srshl v15.4s, v15.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -2466,194 +2465,194 @@
       "srshl v25.4s, v25.4s, v1.4s\n"
       "srshl v26.4s, v26.4s, v2.4s\n"
       "srshl v27.4s, v27.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v8.4s, v8.4s, v30.4s\n"
+      "add v9.4s, v9.4s, v30.4s\n"
+      "add v10.4s, v10.4s, v30.4s\n"
+      "add v11.4s, v11.4s, v30.4s\n"
+      "add v12.4s, v12.4s, v30.4s\n"
+      "add v13.4s, v13.4s, v30.4s\n"
+      "add v14.4s, v14.4s, v30.4s\n"
+      "add v15.4s, v15.4s, v30.4s\n"
+      "add v16.4s, v16.4s, v30.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v30.4s\n"
+      "add v19.4s, v19.4s, v30.4s\n"
+      "add v20.4s, v20.4s, v30.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v30.4s\n"
+      "add v23.4s, v23.4s, v30.4s\n"
+      "add v24.4s, v24.4s, v30.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v30.4s\n"
+      "add v27.4s, v27.4s, v30.4s\n"
+      "smin v8.4s, v8.4s, v29.4s\n"
+      "smin v9.4s, v9.4s, v29.4s\n"
+      "smin v10.4s, v10.4s, v29.4s\n"
+      "smin v11.4s, v11.4s, v29.4s\n"
+      "smin v12.4s, v12.4s, v29.4s\n"
+      "smin v13.4s, v13.4s, v29.4s\n"
+      "smin v14.4s, v14.4s, v29.4s\n"
+      "smin v15.4s, v15.4s, v29.4s\n"
+      "smin v16.4s, v16.4s, v29.4s\n"
+      "smin v17.4s, v17.4s, v29.4s\n"
+      "smin v18.4s, v18.4s, v29.4s\n"
+      "smin v19.4s, v19.4s, v29.4s\n"
+      "smin v20.4s, v20.4s, v29.4s\n"
+      "smin v21.4s, v21.4s, v29.4s\n"
+      "smin v22.4s, v22.4s, v29.4s\n"
+      "smin v23.4s, v23.4s, v29.4s\n"
+      "smin v24.4s, v24.4s, v29.4s\n"
+      "smin v25.4s, v25.4s, v29.4s\n"
+      "smin v26.4s, v26.4s, v29.4s\n"
+      "smin v27.4s, v27.4s, v29.4s\n"
+      "smax v8.4s, v8.4s, v28.4s\n"
+      "smax v9.4s, v9.4s, v28.4s\n"
+      "smax v10.4s, v10.4s, v28.4s\n"
+      "smax v11.4s, v11.4s, v28.4s\n"
+      "smax v12.4s, v12.4s, v28.4s\n"
+      "smax v13.4s, v13.4s, v28.4s\n"
+      "smax v14.4s, v14.4s, v28.4s\n"
+      "smax v15.4s, v15.4s, v28.4s\n"
+      "smax v16.4s, v16.4s, v28.4s\n"
+      "smax v17.4s, v17.4s, v28.4s\n"
+      "smax v18.4s, v18.4s, v28.4s\n"
+      "smax v19.4s, v19.4s, v28.4s\n"
+      "smax v20.4s, v20.4s, v28.4s\n"
+      "smax v21.4s, v21.4s, v28.4s\n"
+      "smax v22.4s, v22.4s, v28.4s\n"
+      "smax v23.4s, v23.4s, v28.4s\n"
+      "smax v24.4s, v24.4s, v28.4s\n"
+      "smax v25.4s, v25.4s, v28.4s\n"
+      "smax v26.4s, v26.4s, v28.4s\n"
+      "smax v27.4s, v27.4s, v28.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v29.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v28.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v18.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "uzp1 v8.16b, v8.16b, v29.16b\n"
+      "uzp1 v12.16b, v12.16b, v28.16b\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v20.16b, v20.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 134f\n"
       "tbz x10, #3, 129f\n"
       "str d8, [x11], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x10, #2, 127f\n"
       "st1 { v8.s }[2], [x11], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x10, #1, 126f\n"
       "st1 { v8.h }[6], [x11], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[14], [x11]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 133f\n"
       "126:"  // Height 5: Partial direct writeback: partial_1_12
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[12], [x11]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 133f\n"
       "127:"  // Height 5: Partial direct writeback: partial_2_8
       "tbz x10, #1, 128f\n"
       "st1 { v8.h }[4], [x11], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[10], [x11]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 133f\n"
       "128:"  // Height 5: Partial direct writeback: partial_1_8
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[8], [x11]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 133f\n"
       "129:"  // Height 5: Partial direct writeback: partial_4_0
       "tbz x10, #2, 131f\n"
       "str s8, [x11], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x10, #1, 130f\n"
       "st1 { v8.h }[2], [x11], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[6], [x11]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 133f\n"
       "130:"  // Height 5: Partial direct writeback: partial_1_4
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[4], [x11]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 133f\n"
       "131:"  // Height 5: Partial direct writeback: partial_2_0
       "tbz x10, #1, 132f\n"
       "str h8, [x11], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x10, #0, 133f\n"
       "st1 { v8.b }[2], [x11]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 133f\n"
       "132:"  // Height 5: Partial direct writeback: partial_1_0
       "str b8, [x11, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "133:"  // Height 5: Partial direct writeback: Done
       "b 135f\n"
       "134:"  // Height 5: Full writeback
       "str q8, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "135:"  // Height 5: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 110b\n"
@@ -2698,16 +2697,16 @@
       "139:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 140f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 141f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2719,11 +2718,11 @@
       "b 141f\n"
       "140:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "141:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "blt 144f\n"
@@ -3002,43 +3001,43 @@
       "cmp x27, #0x4\n"
       "blt 146f\n"
       "145:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s7, [x26], #0x4\n"
+      "ldr s6, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s4, [x23], #0x4\n"
+      "ldr s3, [x22], #0x4\n"
+      "ldr s2, [x21], #0x4\n"
+      "ldr q1, [x9, #0x0]\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x4f87e028  // sdot v8.4s, v1.16b, v7.4b[0]\n"
+      ".inst 0x4f86e02c  // sdot v12.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e030  // sdot v16.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e034  // sdot v20.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e038  // sdot v24.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03c  // sdot v28.4s, v1.16b, v2.4b[0]\n"
+      "ldr q1, [x9, #0x20]\n"
+      ".inst 0x4f87e009  // sdot v9.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00d  // sdot v13.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e011  // sdot v17.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e015  // sdot v21.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e019  // sdot v25.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01d  // sdot v29.4s, v0.16b, v2.4b[0]\n"
+      "ldr q0, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f87e02a  // sdot v10.4s, v1.16b, v7.4b[0]\n"
+      ".inst 0x4f86e02e  // sdot v14.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e032  // sdot v18.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e036  // sdot v22.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e03a  // sdot v26.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03e  // sdot v30.4s, v1.16b, v2.4b[0]\n"
+      ".inst 0x4f87e00b  // sdot v11.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00f  // sdot v15.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e013  // sdot v19.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e017  // sdot v23.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e01b  // sdot v27.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01f  // sdot v31.4s, v0.16b, v2.4b[0]\n"
       "bge 145b\n"
       "146:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 149f\n"
@@ -3065,81 +3064,81 @@
       "ldr b4, [x22, #0x0]\n"
       "ldr b5, [x21, #0x0]\n"
       "148:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x9, #0x30]\n"
+      "ldr q7, [x9, #0x0]\n"
+      "ldr q6, [x9, #0x10]\n"
+      ".inst 0x4f80e0e8  // sdot v8.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ec  // sdot v12.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f0  // sdot v16.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f4  // sdot v20.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f8  // sdot v24.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fc  // sdot v28.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x9, #0x20]\n"
+      ".inst 0x4f80e0c9  // sdot v9.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cd  // sdot v13.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d1  // sdot v17.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d5  // sdot v21.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d9  // sdot v25.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dd  // sdot v29.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x9, #0x30]\n"
       "add x9, x9, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0ea  // sdot v10.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ee  // sdot v14.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f2  // sdot v18.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f6  // sdot v22.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fa  // sdot v26.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fe  // sdot v30.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0cb  // sdot v11.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cf  // sdot v15.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d3  // sdot v19.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d7  // sdot v23.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0db  // sdot v27.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0df  // sdot v31.4s, v6.16b, v5.4b[0]\n"
       "149:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 139b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "ldr q3, [x14, #0x0]\n"
+      "ldr q2, [x14, #0x10]\n"
+      "add v8.4s, v8.4s, v3.4s\n"
+      "add v9.4s, v9.4s, v2.4s\n"
+      "ldr q1, [x14, #0x20]\n"
+      "ldr q0, [x14, #0x30]\n"
+      "add v10.4s, v10.4s, v1.4s\n"
+      "add v11.4s, v11.4s, v0.4s\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x11, x20\n"
+      "add x24, x25, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
+      "add x23, x24, x20\n"
       "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
+      "add x21, x22, x20\n"
       "prfm pstl1keep, [x23, #0x0]\n"
-      "add x20, x21, x20\n"
       "prfm pstl1keep, [x22, #0x0]\n"
+      "add v12.4s, v12.4s, v3.4s\n"
       "prfm pstl1keep, [x21, #0x0]\n"
-      "add v12.4s, v12.4s, v0.4s\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
-      "add v13.4s, v13.4s, v1.4s\n"
-      "add v14.4s, v14.4s, v2.4s\n"
+      "add v13.4s, v13.4s, v2.4s\n"
+      "add v14.4s, v14.4s, v1.4s\n"
       "add x14, x14, #0x40\n"
-      "add v15.4s, v15.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "add v28.4s, v28.4s, v0.4s\n"
-      "add v29.4s, v29.4s, v1.4s\n"
-      "add v30.4s, v30.4s, v2.4s\n"
-      "add v31.4s, v31.4s, v3.4s\n"
+      "add v15.4s, v15.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v2.4s\n"
+      "add v18.4s, v18.4s, v1.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v2.4s\n"
+      "add v22.4s, v22.4s, v1.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v2.4s\n"
+      "add v26.4s, v26.4s, v1.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v2.4s\n"
+      "add v30.4s, v30.4s, v1.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
       "tbz %x[flags], #4, 150f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -3153,10 +3152,10 @@
       "add x13, x13, #0x40\n"
       "b 151f\n"
       "150:"  // Height 6: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -3189,91 +3188,91 @@
       "sqrdmulh v30.4s, v30.4s, v6.4s\n"
       "sqrdmulh v31.4s, v31.4s, v7.4s\n"
       "tbz %x[flags], #5, 152f\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v7.16b, v8.16b, v0.16b\n"
+      "and v6.16b, v9.16b, v1.16b\n"
+      "and v5.16b, v10.16b, v2.16b\n"
+      "and v4.16b, v11.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v12.16b, v0.16b\n"
-      "and v5.16b, v13.16b, v1.16b\n"
-      "and v6.16b, v14.16b, v2.16b\n"
-      "and v7.16b, v15.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v12.4s, v12.4s, v4.4s\n"
-      "sqadd v13.4s, v13.4s, v5.4s\n"
-      "sqadd v14.4s, v14.4s, v6.4s\n"
-      "sqadd v15.4s, v15.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v20.16b, v0.16b\n"
-      "and v5.16b, v21.16b, v1.16b\n"
-      "and v6.16b, v22.16b, v2.16b\n"
-      "and v7.16b, v23.16b, v3.16b\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v7.4s\n"
+      "sqadd v9.4s, v9.4s, v6.4s\n"
+      "sqadd v10.4s, v10.4s, v5.4s\n"
+      "sqadd v11.4s, v11.4s, v4.4s\n"
+      "and v7.16b, v12.16b, v0.16b\n"
+      "and v6.16b, v13.16b, v1.16b\n"
+      "and v5.16b, v14.16b, v2.16b\n"
+      "and v4.16b, v15.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v4.4s\n"
-      "sqadd v21.4s, v21.4s, v5.4s\n"
-      "sqadd v22.4s, v22.4s, v6.4s\n"
-      "sqadd v23.4s, v23.4s, v7.4s\n"
-      "and v4.16b, v24.16b, v0.16b\n"
-      "and v5.16b, v25.16b, v1.16b\n"
-      "and v6.16b, v26.16b, v2.16b\n"
-      "and v7.16b, v27.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v7.4s\n"
+      "sqadd v13.4s, v13.4s, v6.4s\n"
+      "sqadd v14.4s, v14.4s, v5.4s\n"
+      "sqadd v15.4s, v15.4s, v4.4s\n"
+      "and v7.16b, v16.16b, v0.16b\n"
+      "and v6.16b, v17.16b, v1.16b\n"
+      "and v5.16b, v18.16b, v2.16b\n"
+      "and v4.16b, v19.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v24.4s, v24.4s, v4.4s\n"
-      "sqadd v25.4s, v25.4s, v5.4s\n"
-      "sqadd v26.4s, v26.4s, v6.4s\n"
-      "sqadd v27.4s, v27.4s, v7.4s\n"
-      "and v4.16b, v28.16b, v0.16b\n"
-      "and v5.16b, v29.16b, v1.16b\n"
-      "and v6.16b, v30.16b, v2.16b\n"
-      "and v7.16b, v31.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v7.4s\n"
+      "sqadd v17.4s, v17.4s, v6.4s\n"
+      "sqadd v18.4s, v18.4s, v5.4s\n"
+      "sqadd v19.4s, v19.4s, v4.4s\n"
+      "and v7.16b, v20.16b, v0.16b\n"
+      "and v6.16b, v21.16b, v1.16b\n"
+      "and v5.16b, v22.16b, v2.16b\n"
+      "and v4.16b, v23.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v28.4s, v28.4s, v4.4s\n"
-      "sqadd v29.4s, v29.4s, v5.4s\n"
-      "sqadd v30.4s, v30.4s, v6.4s\n"
-      "sqadd v31.4s, v31.4s, v7.4s\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v7.4s\n"
+      "sqadd v21.4s, v21.4s, v6.4s\n"
+      "sqadd v22.4s, v22.4s, v5.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v7.16b, v24.16b, v0.16b\n"
+      "and v6.16b, v25.16b, v1.16b\n"
+      "and v5.16b, v26.16b, v2.16b\n"
+      "and v4.16b, v27.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v7.4s\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "sqadd v26.4s, v26.4s, v5.4s\n"
+      "sqadd v27.4s, v27.4s, v4.4s\n"
+      "and v7.16b, v28.16b, v0.16b\n"
+      "and v6.16b, v29.16b, v1.16b\n"
+      "and v5.16b, v30.16b, v2.16b\n"
+      "and v4.16b, v31.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v7.4s\n"
+      "sqadd v29.4s, v29.4s, v6.4s\n"
+      "sqadd v30.4s, v30.4s, v5.4s\n"
+      "sqadd v31.4s, v31.4s, v4.4s\n"
       "152:"  // Height 6: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v6.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v5.4s }, [x20]\n"
       "srshl v12.4s, v12.4s, v0.4s\n"
       "srshl v13.4s, v13.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "srshl v14.4s, v14.4s, v2.4s\n"
       "srshl v15.4s, v15.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -3293,225 +3292,225 @@
       "srshl v29.4s, v29.4s, v1.4s\n"
       "srshl v30.4s, v30.4s, v2.4s\n"
       "srshl v31.4s, v31.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
+      "add v8.4s, v8.4s, v6.4s\n"
+      "add v9.4s, v9.4s, v6.4s\n"
+      "add v10.4s, v10.4s, v6.4s\n"
+      "add v11.4s, v11.4s, v6.4s\n"
+      "add v12.4s, v12.4s, v6.4s\n"
+      "add v13.4s, v13.4s, v6.4s\n"
+      "add v14.4s, v14.4s, v6.4s\n"
+      "add v15.4s, v15.4s, v6.4s\n"
+      "add v16.4s, v16.4s, v6.4s\n"
+      "add v17.4s, v17.4s, v6.4s\n"
+      "add v18.4s, v18.4s, v6.4s\n"
+      "add v19.4s, v19.4s, v6.4s\n"
+      "add v20.4s, v20.4s, v6.4s\n"
+      "add v21.4s, v21.4s, v6.4s\n"
+      "add v22.4s, v22.4s, v6.4s\n"
+      "add v23.4s, v23.4s, v6.4s\n"
+      "add v24.4s, v24.4s, v6.4s\n"
+      "add v25.4s, v25.4s, v6.4s\n"
+      "add v26.4s, v26.4s, v6.4s\n"
+      "add v27.4s, v27.4s, v6.4s\n"
+      "add v28.4s, v28.4s, v6.4s\n"
+      "add v29.4s, v29.4s, v6.4s\n"
+      "add v30.4s, v30.4s, v6.4s\n"
+      "add v31.4s, v31.4s, v6.4s\n"
+      "smin v8.4s, v8.4s, v5.4s\n"
+      "smin v9.4s, v9.4s, v5.4s\n"
+      "smin v10.4s, v10.4s, v5.4s\n"
+      "smin v11.4s, v11.4s, v5.4s\n"
+      "smin v12.4s, v12.4s, v5.4s\n"
+      "smin v13.4s, v13.4s, v5.4s\n"
+      "smin v14.4s, v14.4s, v5.4s\n"
+      "smin v15.4s, v15.4s, v5.4s\n"
+      "smin v16.4s, v16.4s, v5.4s\n"
+      "smin v17.4s, v17.4s, v5.4s\n"
+      "smin v18.4s, v18.4s, v5.4s\n"
+      "smin v19.4s, v19.4s, v5.4s\n"
+      "smin v20.4s, v20.4s, v5.4s\n"
+      "smin v21.4s, v21.4s, v5.4s\n"
+      "smin v22.4s, v22.4s, v5.4s\n"
+      "smin v23.4s, v23.4s, v5.4s\n"
+      "smin v24.4s, v24.4s, v5.4s\n"
+      "smin v25.4s, v25.4s, v5.4s\n"
+      "smin v26.4s, v26.4s, v5.4s\n"
+      "smin v27.4s, v27.4s, v5.4s\n"
+      "smin v28.4s, v28.4s, v5.4s\n"
+      "smin v29.4s, v29.4s, v5.4s\n"
+      "smin v30.4s, v30.4s, v5.4s\n"
+      "smin v31.4s, v31.4s, v5.4s\n"
+      "smax v8.4s, v8.4s, v4.4s\n"
+      "smax v9.4s, v9.4s, v4.4s\n"
+      "smax v10.4s, v10.4s, v4.4s\n"
+      "smax v11.4s, v11.4s, v4.4s\n"
+      "smax v12.4s, v12.4s, v4.4s\n"
+      "smax v13.4s, v13.4s, v4.4s\n"
+      "smax v14.4s, v14.4s, v4.4s\n"
+      "smax v15.4s, v15.4s, v4.4s\n"
+      "smax v16.4s, v16.4s, v4.4s\n"
+      "smax v17.4s, v17.4s, v4.4s\n"
+      "smax v18.4s, v18.4s, v4.4s\n"
+      "smax v19.4s, v19.4s, v4.4s\n"
+      "smax v20.4s, v20.4s, v4.4s\n"
+      "smax v21.4s, v21.4s, v4.4s\n"
+      "smax v22.4s, v22.4s, v4.4s\n"
+      "smax v23.4s, v23.4s, v4.4s\n"
+      "smax v24.4s, v24.4s, v4.4s\n"
+      "smax v25.4s, v25.4s, v4.4s\n"
+      "smax v26.4s, v26.4s, v4.4s\n"
+      "smax v27.4s, v27.4s, v4.4s\n"
+      "smax v28.4s, v28.4s, v4.4s\n"
+      "smax v29.4s, v29.4s, v4.4s\n"
+      "smax v30.4s, v30.4s, v4.4s\n"
+      "smax v31.4s, v31.4s, v4.4s\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v2.8h, v10.8h, v11.8h\n"
       "uzp1 v12.8h, v12.8h, v13.8h\n"
-      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v1.8h, v14.8h, v15.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v0.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v19.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v18.8h, v26.8h, v27.8h\n"
       "uzp1 v28.8h, v28.8h, v29.8h\n"
-      "uzp1 v29.8h, v30.8h, v31.8h\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
-      "uzp1 v12.16b, v12.16b, v13.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
-      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "uzp1 v17.8h, v30.8h, v31.8h\n"
+      "uzp1 v8.16b, v8.16b, v2.16b\n"
+      "uzp1 v12.16b, v12.16b, v1.16b\n"
+      "uzp1 v16.16b, v16.16b, v0.16b\n"
+      "uzp1 v20.16b, v20.16b, v19.16b\n"
+      "uzp1 v24.16b, v24.16b, v18.16b\n"
+      "uzp1 v28.16b, v28.16b, v17.16b\n"
       "bge 161f\n"
       "tbz x10, #3, 156f\n"
       "str d8, [x11], #0x8\n"
-      "str d12, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "str d28, [x20], #0x8\n"
+      "str d12, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "str d28, [x21], #0x8\n"
       "tbz x10, #2, 154f\n"
       "st1 { v8.s }[2], [x11], #0x4\n"
-      "st1 { v12.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "st1 { v28.s }[2], [x20], #0x4\n"
+      "st1 { v12.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
       "tbz x10, #1, 153f\n"
       "st1 { v8.h }[6], [x11], #0x2\n"
-      "st1 { v12.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "st1 { v28.h }[6], [x20], #0x2\n"
+      "st1 { v12.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[14], [x11]\n"
-      "st1 { v12.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
-      "st1 { v28.b }[14], [x20]\n"
+      "st1 { v12.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
+      "st1 { v28.b }[14], [x21]\n"
       "b 160f\n"
       "153:"  // Height 6: Partial direct writeback: partial_1_12
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[12], [x11]\n"
-      "st1 { v12.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
-      "st1 { v28.b }[12], [x20]\n"
+      "st1 { v12.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
+      "st1 { v28.b }[12], [x21]\n"
       "b 160f\n"
       "154:"  // Height 6: Partial direct writeback: partial_2_8
       "tbz x10, #1, 155f\n"
       "st1 { v8.h }[4], [x11], #0x2\n"
-      "st1 { v12.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "st1 { v28.h }[4], [x20], #0x2\n"
+      "st1 { v12.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[10], [x11]\n"
-      "st1 { v12.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
-      "st1 { v28.b }[10], [x20]\n"
+      "st1 { v12.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
+      "st1 { v28.b }[10], [x21]\n"
       "b 160f\n"
       "155:"  // Height 6: Partial direct writeback: partial_1_8
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[8], [x11]\n"
-      "st1 { v12.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
-      "st1 { v28.b }[8], [x20]\n"
+      "st1 { v12.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
+      "st1 { v28.b }[8], [x21]\n"
       "b 160f\n"
       "156:"  // Height 6: Partial direct writeback: partial_4_0
       "tbz x10, #2, 158f\n"
       "str s8, [x11], #0x4\n"
-      "str s12, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "str s28, [x20], #0x4\n"
+      "str s12, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "str s28, [x21], #0x4\n"
       "tbz x10, #1, 157f\n"
       "st1 { v8.h }[2], [x11], #0x2\n"
-      "st1 { v12.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "st1 { v28.h }[2], [x20], #0x2\n"
+      "st1 { v12.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[6], [x11]\n"
-      "st1 { v12.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
-      "st1 { v28.b }[6], [x20]\n"
+      "st1 { v12.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
+      "st1 { v28.b }[6], [x21]\n"
       "b 160f\n"
       "157:"  // Height 6: Partial direct writeback: partial_1_4
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[4], [x11]\n"
-      "st1 { v12.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
-      "st1 { v28.b }[4], [x20]\n"
+      "st1 { v12.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
+      "st1 { v28.b }[4], [x21]\n"
       "b 160f\n"
       "158:"  // Height 6: Partial direct writeback: partial_2_0
       "tbz x10, #1, 159f\n"
       "str h8, [x11], #0x2\n"
-      "str h12, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "str h28, [x20], #0x2\n"
+      "str h12, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "str h28, [x21], #0x2\n"
       "tbz x10, #0, 160f\n"
       "st1 { v8.b }[2], [x11]\n"
-      "st1 { v12.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
-      "st1 { v28.b }[2], [x20]\n"
+      "st1 { v12.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
+      "st1 { v28.b }[2], [x21]\n"
       "b 160f\n"
       "159:"  // Height 6: Partial direct writeback: partial_1_0
       "str b8, [x11, #0x0]\n"
-      "str b12, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
-      "str b28, [x20, #0x0]\n"
+      "str b12, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
       "160:"  // Height 6: Partial direct writeback: Done
       "b 162f\n"
       "161:"  // Height 6: Full writeback
       "str q8, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q12, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
-      "str q28, [x20, #0x0]\n"
+      "str q12, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
       "162:"  // Height 6: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 137b\n"
@@ -3527,7 +3526,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "164:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
index 7eacdce..d0d5f1b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -98,5 +98,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
index fc52553..0771829 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
@@ -85,7 +85,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 146f\n"
@@ -115,11 +114,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -135,41 +134,41 @@
       "ldr q6, [x9, #0x10]\n"
       "blt 8f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      "trn1 v18.2d, v1.2d, v21.2d\n"
+      ".inst 0x4e87a648  // smmla v8.4s, v18.16b, v7.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e86a64c  // smmla v12.4s, v18.16b, v6.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v21.2d\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4e91a428  // smmla v8.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4e90a42c  // smmla v12.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4e91a429  // smmla v9.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4e90a42d  // smmla v13.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4e91a42a  // smmla v10.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4e90a42e  // smmla v14.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e91a42b  // smmla v11.4s, v1.16b, v17.16b\n"
+      ".inst 0x4e90a42f  // smmla v15.4s, v1.16b, v16.16b\n"
       "ldr q1, [x26, #0x0]\n"
       "add x9, x9, #0x100\n"
       "ldr q7, [x9, #0x0]\n"
@@ -177,40 +176,40 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       "bge 7b\n"
       "8:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      "trn1 v18.2d, v1.2d, v19.2d\n"
+      ".inst 0x4e87a648  // smmla v8.4s, v18.16b, v7.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e86a64c  // smmla v12.4s, v18.16b, v6.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v19.2d\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4e91a428  // smmla v8.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4e90a42c  // smmla v12.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4e91a429  // smmla v9.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4e90a42d  // smmla v13.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4e91a42a  // smmla v10.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4e90a42e  // smmla v14.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e91a42b  // smmla v11.4s, v1.16b, v17.16b\n"
+      ".inst 0x4e90a42f  // smmla v15.4s, v1.16b, v16.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x9, x9, #0x100\n"
       "9:"  // Height 1: Multiply loop: Main loop skip
@@ -218,26 +217,26 @@
       "cmp x27, #0x8\n"
       "blt 11f\n"
       "10:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr q6, [x9, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x70]\n"
+      "ldr d18, [x26], #0x8\n"
+      "ldr q17, [x9, #0x0]\n"
+      "trn1 v18.2d, v18.2d, v16.2d\n"
+      "ldr q31, [x9, #0x10]\n"
+      ".inst 0x4e91a648  // smmla v8.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e9fa64c  // smmla v12.4s, v18.16b, v31.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
       "add x9, x9, #0x80\n"
       "bge 10b\n"
       "11:"  // Height 1: Multiply loop: Skip odd blocks
@@ -262,44 +261,44 @@
       "14:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b1, [x26, #0x0]\n"
       "15:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x9, #0x0]\n"
-      "ldr q6, [x9, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      "ldr q17, [x9, #0x0]\n"
+      "ldr q19, [x9, #0x10]\n"
+      "trn1 v18.2d, v1.2d, v16.2d\n"
+      ".inst 0x4e91a648  // smmla v8.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e93a64c  // smmla v12.4s, v18.16b, v19.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
       "add x9, x9, #0x80\n"
       "16:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 4b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
+      "ldr q19, [x14, #0x0]\n"
+      "ldr q18, [x14, #0x10]\n"
       "uzp1 v8.2d, v8.2d, v12.2d\n"
       "uzp1 v9.2d, v9.2d, v13.2d\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
+      "ldr q17, [x14, #0x20]\n"
+      "ldr q16, [x14, #0x30]\n"
       "uzp1 v10.2d, v10.2d, v14.2d\n"
       "uzp1 v11.2d, v11.2d, v15.2d\n"
       "mov v15.16b, v8.16b\n"
       "prfm pstl1keep, [x11, #0x0]\n"
-      "add v15.4s, v15.4s, v0.4s\n"
+      "add v15.4s, v15.4s, v19.4s\n"
       "add x14, x14, #0x40\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "add v10.4s, v10.4s, v17.4s\n"
+      "add v11.4s, v11.4s, v16.4s\n"
       "tbz %x[flags], #4, 17f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -313,10 +312,10 @@
       "add x13, x13, #0x40\n"
       "b 18f\n"
       "17:"  // Height 1: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -329,45 +328,45 @@
       "sqrdmulh v10.4s, v10.4s, v6.4s\n"
       "sqrdmulh v11.4s, v11.4s, v7.4s\n"
       "tbz %x[flags], #5, 19f\n"
-      "and v4.16b, v15.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v15.4s, v15.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v17.16b, v15.16b, v0.16b\n"
+      "and v16.16b, v9.16b, v1.16b\n"
+      "and v25.16b, v10.16b, v2.16b\n"
+      "and v18.16b, v11.16b, v3.16b\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v17.4s\n"
+      "sqadd v9.4s, v9.4s, v16.4s\n"
+      "sqadd v10.4s, v10.4s, v25.4s\n"
+      "sqadd v11.4s, v11.4s, v18.4s\n"
       "19:"  // Height 1: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "srshl v15.4s, v15.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v17.4s }, [x20]\n"
+      "add v15.4s, v15.4s, v18.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "add v10.4s, v10.4s, v18.4s\n"
+      "add v11.4s, v11.4s, v18.4s\n"
       "cmp x10, #0x10\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
+      "smin v15.4s, v15.4s, v17.4s\n"
+      "smin v9.4s, v9.4s, v17.4s\n"
+      "smin v10.4s, v10.4s, v17.4s\n"
+      "smin v11.4s, v11.4s, v17.4s\n"
+      "smax v15.4s, v15.4s, v16.4s\n"
+      "smax v9.4s, v9.4s, v16.4s\n"
+      "smax v10.4s, v10.4s, v16.4s\n"
+      "smax v11.4s, v11.4s, v16.4s\n"
       "uzp1 v15.8h, v15.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
-      "uzp1 v15.16b, v15.16b, v9.16b\n"
+      "uzp1 v16.8h, v10.8h, v11.8h\n"
+      "uzp1 v15.16b, v15.16b, v16.16b\n"
       "bge 28f\n"
       "tbz x10, #3, 23f\n"
       "str d15, [x11], #0x8\n"
@@ -442,12 +441,12 @@
       "33:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 34f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 35f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -455,7 +454,7 @@
       "b 35f\n"
       "34:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "35:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "blt 38f\n"
@@ -466,85 +465,85 @@
       "ldr q6, [x9, #0x10]\n"
       "blt 37f\n"
       "36:"  // Height 2: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
+      "trn1 v18.2d, v1.2d, v2.2d\n"
+      ".inst 0x4e87a648  // smmla v8.4s, v18.16b, v7.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e86a64c  // smmla v12.4s, v18.16b, v6.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4e91a428  // smmla v8.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4e90a42c  // smmla v12.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4e91a429  // smmla v9.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4e90a42d  // smmla v13.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4e91a42a  // smmla v10.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4e90a42e  // smmla v14.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "ldr q2, [x25, #0x0]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e91a42b  // smmla v11.4s, v1.16b, v17.16b\n"
       "add x9, x9, #0x100\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e90a42f  // smmla v15.4s, v1.16b, v16.16b\n"
       "ldr q1, [x26, #0x0]\n"
       "ldr q6, [x9, #0x10]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "bge 36b\n"
       "37:"  // Height 2: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
+      "trn1 v18.2d, v1.2d, v2.2d\n"
+      ".inst 0x4e87a648  // smmla v8.4s, v18.16b, v7.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e86a64c  // smmla v12.4s, v18.16b, v6.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x80]\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x90]\n"
+      ".inst 0x4e91a428  // smmla v8.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xa0]\n"
+      ".inst 0x4e90a42c  // smmla v12.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xb0]\n"
+      ".inst 0x4e91a429  // smmla v9.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xc0]\n"
+      ".inst 0x4e90a42d  // smmla v13.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xd0]\n"
+      ".inst 0x4e91a42a  // smmla v10.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x9, #0xe0]\n"
+      ".inst 0x4e90a42e  // smmla v14.4s, v1.16b, v16.16b\n"
+      "ldr q16, [x9, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e91a42b  // smmla v11.4s, v1.16b, v17.16b\n"
+      ".inst 0x4e90a42f  // smmla v15.4s, v1.16b, v16.16b\n"
       "sub x27, x27, #0x10\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
@@ -554,27 +553,27 @@
       "cmp x27, #0x8\n"
       "blt 40f\n"
       "39:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d17, [x26], #0x8\n"
+      "ldr d16, [x25], #0x8\n"
+      "trn1 v18.2d, v17.2d, v16.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x9, #0x20]\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x9, #0x40]\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x9, #0x60]\n"
-      "ldr q7, [x9, #0x70]\n"
+      "ldr q17, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x10]\n"
+      ".inst 0x4e91a648  // smmla v8.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64c  // smmla v12.4s, v18.16b, v16.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      "ldr q16, [x9, #0x70]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
       "add x9, x9, #0x80\n"
       "bge 39b\n"
       "40:"  // Height 2: Multiply loop: Skip odd blocks
@@ -606,55 +605,55 @@
       "ldr b1, [x26, #0x0]\n"
       "ldr b2, [x25, #0x0]\n"
       "44:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x9, #0x0]\n"
-      "ldr q6, [x9, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      "ldr q17, [x9, #0x0]\n"
+      "ldr q16, [x9, #0x10]\n"
+      "trn1 v18.2d, v1.2d, v2.2d\n"
+      ".inst 0x4e91a648  // smmla v8.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x20]\n"
+      ".inst 0x4e90a64c  // smmla v12.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x30]\n"
+      ".inst 0x4e91a649  // smmla v9.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x40]\n"
+      ".inst 0x4e90a64d  // smmla v13.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x50]\n"
+      ".inst 0x4e91a64a  // smmla v10.4s, v18.16b, v17.16b\n"
+      "ldr q17, [x9, #0x60]\n"
+      ".inst 0x4e90a64e  // smmla v14.4s, v18.16b, v16.16b\n"
+      "ldr q16, [x9, #0x70]\n"
+      ".inst 0x4e91a64b  // smmla v11.4s, v18.16b, v17.16b\n"
+      ".inst 0x4e90a64f  // smmla v15.4s, v18.16b, v16.16b\n"
       "add x9, x9, #0x80\n"
       "45:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 33b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "uzp1 v7.2d, v8.2d, v12.2d\n"
+      "ldr q19, [x14, #0x0]\n"
+      "ldr q18, [x14, #0x10]\n"
+      "uzp1 v17.2d, v8.2d, v12.2d\n"
       "uzp2 v8.2d, v8.2d, v12.2d\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
+      "ldr q5, [x14, #0x20]\n"
+      "ldr q16, [x14, #0x30]\n"
       "uzp1 v12.2d, v9.2d, v13.2d\n"
       "uzp2 v9.2d, v9.2d, v13.2d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp1 v13.2d, v10.2d, v14.2d\n"
       "uzp2 v10.2d, v10.2d, v14.2d\n"
-      "add x24, x11, x20\n"
+      "add x25, x11, x20\n"
       "uzp1 v14.2d, v11.2d, v15.2d\n"
       "uzp2 v11.2d, v11.2d, v15.2d\n"
       "prfm pstl1keep, [x11, #0x0]\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
-      "mov v15.16b, v7.16b\n"
-      "add v15.4s, v15.4s, v0.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "mov v15.16b, v17.16b\n"
+      "add v15.4s, v15.4s, v19.4s\n"
       "add x14, x14, #0x40\n"
-      "add v12.4s, v12.4s, v1.4s\n"
-      "add v13.4s, v13.4s, v2.4s\n"
-      "add v14.4s, v14.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
+      "add v12.4s, v12.4s, v18.4s\n"
+      "add v13.4s, v13.4s, v5.4s\n"
+      "add v14.4s, v14.4s, v16.4s\n"
+      "add v8.4s, v8.4s, v19.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "add v10.4s, v10.4s, v5.4s\n"
+      "add v11.4s, v11.4s, v16.4s\n"
       "tbz %x[flags], #4, 46f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -668,10 +667,10 @@
       "add x13, x13, #0x40\n"
       "b 47f\n"
       "46:"  // Height 2: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -688,141 +687,141 @@
       "sqrdmulh v10.4s, v10.4s, v6.4s\n"
       "sqrdmulh v11.4s, v11.4s, v7.4s\n"
       "tbz %x[flags], #5, 48f\n"
-      "and v4.16b, v15.16b, v0.16b\n"
-      "and v5.16b, v12.16b, v1.16b\n"
-      "and v6.16b, v13.16b, v2.16b\n"
-      "and v7.16b, v14.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v15.4s, v15.4s, v4.4s\n"
-      "sqadd v12.4s, v12.4s, v5.4s\n"
-      "sqadd v13.4s, v13.4s, v6.4s\n"
-      "sqadd v14.4s, v14.4s, v7.4s\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v19.16b, v15.16b, v0.16b\n"
+      "and v18.16b, v12.16b, v1.16b\n"
+      "and v17.16b, v13.16b, v2.16b\n"
+      "and v16.16b, v14.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v19.4s\n"
+      "sqadd v12.4s, v12.4s, v18.4s\n"
+      "sqadd v13.4s, v13.4s, v17.4s\n"
+      "sqadd v14.4s, v14.4s, v16.4s\n"
+      "and v19.16b, v8.16b, v0.16b\n"
+      "and v18.16b, v9.16b, v1.16b\n"
+      "and v17.16b, v10.16b, v2.16b\n"
+      "and v16.16b, v11.16b, v3.16b\n"
+      "sshr v19.4s, v19.4s, #0x1f\n"
+      "sshr v18.4s, v18.4s, #0x1f\n"
+      "sshr v17.4s, v17.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v19.4s\n"
+      "sqadd v9.4s, v9.4s, v18.4s\n"
+      "sqadd v10.4s, v10.4s, v17.4s\n"
+      "sqadd v11.4s, v11.4s, v16.4s\n"
       "48:"  // Height 2: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v18.4s }, [x20]\n"
       "srshl v15.4s, v15.4s, v0.4s\n"
       "srshl v12.4s, v12.4s, v1.4s\n"
       "srshl v13.4s, v13.4s, v2.4s\n"
       "srshl v14.4s, v14.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v17.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
       "cmp x10, #0x10\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
+      "add v15.4s, v15.4s, v18.4s\n"
+      "add v12.4s, v12.4s, v18.4s\n"
+      "add v13.4s, v13.4s, v18.4s\n"
+      "add v14.4s, v14.4s, v18.4s\n"
+      "add v8.4s, v8.4s, v18.4s\n"
+      "add v9.4s, v9.4s, v18.4s\n"
+      "add v10.4s, v10.4s, v18.4s\n"
+      "add v11.4s, v11.4s, v18.4s\n"
+      "smin v15.4s, v15.4s, v17.4s\n"
+      "smin v12.4s, v12.4s, v17.4s\n"
+      "smin v13.4s, v13.4s, v17.4s\n"
+      "smin v14.4s, v14.4s, v17.4s\n"
+      "smin v8.4s, v8.4s, v17.4s\n"
+      "smin v9.4s, v9.4s, v17.4s\n"
+      "smin v10.4s, v10.4s, v17.4s\n"
+      "smin v11.4s, v11.4s, v17.4s\n"
+      "smax v15.4s, v15.4s, v16.4s\n"
+      "smax v12.4s, v12.4s, v16.4s\n"
+      "smax v13.4s, v13.4s, v16.4s\n"
+      "smax v14.4s, v14.4s, v16.4s\n"
+      "smax v8.4s, v8.4s, v16.4s\n"
+      "smax v9.4s, v9.4s, v16.4s\n"
+      "smax v10.4s, v10.4s, v16.4s\n"
+      "smax v11.4s, v11.4s, v16.4s\n"
       "uzp1 v15.8h, v15.8h, v12.8h\n"
-      "uzp1 v12.8h, v13.8h, v14.8h\n"
+      "uzp1 v17.8h, v13.8h, v14.8h\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
-      "uzp1 v15.16b, v15.16b, v12.16b\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v16.8h, v10.8h, v11.8h\n"
+      "uzp1 v15.16b, v15.16b, v17.16b\n"
+      "uzp1 v8.16b, v8.16b, v16.16b\n"
       "bge 57f\n"
       "tbz x10, #3, 52f\n"
       "str d15, [x11], #0x8\n"
-      "str d8, [x24], #0x8\n"
+      "str d8, [x25], #0x8\n"
       "tbz x10, #2, 50f\n"
       "st1 { v15.s }[2], [x11], #0x4\n"
-      "st1 { v8.s }[2], [x24], #0x4\n"
+      "st1 { v8.s }[2], [x25], #0x4\n"
       "tbz x10, #1, 49f\n"
       "st1 { v15.h }[6], [x11], #0x2\n"
-      "st1 { v8.h }[6], [x24], #0x2\n"
+      "st1 { v8.h }[6], [x25], #0x2\n"
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[14], [x11]\n"
-      "st1 { v8.b }[14], [x24]\n"
+      "st1 { v8.b }[14], [x25]\n"
       "b 56f\n"
       "49:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[12], [x11]\n"
-      "st1 { v8.b }[12], [x24]\n"
+      "st1 { v8.b }[12], [x25]\n"
       "b 56f\n"
       "50:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x10, #1, 51f\n"
       "st1 { v15.h }[4], [x11], #0x2\n"
-      "st1 { v8.h }[4], [x24], #0x2\n"
+      "st1 { v8.h }[4], [x25], #0x2\n"
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[10], [x11]\n"
-      "st1 { v8.b }[10], [x24]\n"
+      "st1 { v8.b }[10], [x25]\n"
       "b 56f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[8], [x11]\n"
-      "st1 { v8.b }[8], [x24]\n"
+      "st1 { v8.b }[8], [x25]\n"
       "b 56f\n"
       "52:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x10, #2, 54f\n"
       "str s15, [x11], #0x4\n"
-      "str s8, [x24], #0x4\n"
+      "str s8, [x25], #0x4\n"
       "tbz x10, #1, 53f\n"
       "st1 { v15.h }[2], [x11], #0x2\n"
-      "st1 { v8.h }[2], [x24], #0x2\n"
+      "st1 { v8.h }[2], [x25], #0x2\n"
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[6], [x11]\n"
-      "st1 { v8.b }[6], [x24]\n"
+      "st1 { v8.b }[6], [x25]\n"
       "b 56f\n"
       "53:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[4], [x11]\n"
-      "st1 { v8.b }[4], [x24]\n"
+      "st1 { v8.b }[4], [x25]\n"
       "b 56f\n"
       "54:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x10, #1, 55f\n"
       "str h15, [x11], #0x2\n"
-      "str h8, [x24], #0x2\n"
+      "str h8, [x25], #0x2\n"
       "tbz x10, #0, 56f\n"
       "st1 { v15.b }[2], [x11]\n"
-      "st1 { v8.b }[2], [x24]\n"
+      "st1 { v8.b }[2], [x25]\n"
       "b 56f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_0
       "str b15, [x11, #0x0]\n"
-      "str b8, [x24, #0x0]\n"
+      "str b8, [x25, #0x0]\n"
       "56:"  // Height 2: Partial direct writeback: Done
       "b 58f\n"
       "57:"  // Height 2: Full writeback
       "str q15, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q8, [x24, #0x0]\n"
+      "str q8, [x25, #0x0]\n"
       "58:"  // Height 2: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 31b\n"
@@ -856,13 +855,13 @@
       "62:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 63f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 64f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -871,8 +870,8 @@
       "b 64f\n"
       "63:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "64:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "blt 67f\n"
@@ -884,167 +883,167 @@
       "ldr q6, [x9, #0x10]\n"
       "blt 66f\n"
       "65:"  // Height 3: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a768  // smmla v8.4s, v27.16b, v7.16b\n"
+      "trn1 v26.2d, v3.2d, v28.2d\n"
+      ".inst 0x4e87a750  // smmla v16.4s, v26.16b, v7.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e86a76c  // smmla v12.4s, v27.16b, v6.16b\n"
+      ".inst 0x4e86a754  // smmla v20.4s, v26.16b, v6.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
+      "trn2 v3.2d, v3.2d, v28.2d\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x80]\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a428  // smmla v8.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a470  // smmla v16.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xa0]\n"
+      ".inst 0x4e98a42c  // smmla v12.4s, v1.16b, v24.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e98a474  // smmla v20.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xb0]\n"
+      ".inst 0x4e99a429  // smmla v9.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a471  // smmla v17.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xc0]\n"
+      ".inst 0x4e98a42d  // smmla v13.4s, v1.16b, v24.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      ".inst 0x4e98a475  // smmla v21.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xd0]\n"
+      ".inst 0x4e99a42a  // smmla v10.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a472  // smmla v18.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xe0]\n"
+      ".inst 0x4e98a42e  // smmla v14.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a476  // smmla v22.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e99a42b  // smmla v11.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a473  // smmla v19.4s, v3.16b, v25.16b\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e98a42f  // smmla v15.4s, v1.16b, v24.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e98a477  // smmla v23.4s, v3.16b, v24.16b\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x9, #0x10]\n"
       "bge 65b\n"
       "66:"  // Height 3: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a768  // smmla v8.4s, v27.16b, v7.16b\n"
+      "trn1 v26.2d, v3.2d, v25.2d\n"
+      ".inst 0x4e87a750  // smmla v16.4s, v26.16b, v7.16b\n"
+      "ldr q24, [x9, #0x20]\n"
+      ".inst 0x4e86a76c  // smmla v12.4s, v27.16b, v6.16b\n"
+      ".inst 0x4e86a754  // smmla v20.4s, v26.16b, v6.16b\n"
+      "ldr q0, [x9, #0x30]\n"
+      ".inst 0x4e98a769  // smmla v9.4s, v27.16b, v24.16b\n"
+      "trn2 v3.2d, v3.2d, v25.2d\n"
+      ".inst 0x4e98a751  // smmla v17.4s, v26.16b, v24.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e80a76d  // smmla v13.4s, v27.16b, v0.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a755  // smmla v21.4s, v26.16b, v0.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x80]\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x90]\n"
+      ".inst 0x4e99a428  // smmla v8.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a470  // smmla v16.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xa0]\n"
+      ".inst 0x4e98a42c  // smmla v12.4s, v1.16b, v24.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      ".inst 0x4e98a474  // smmla v20.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xb0]\n"
+      ".inst 0x4e99a429  // smmla v9.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a471  // smmla v17.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xc0]\n"
+      ".inst 0x4e98a42d  // smmla v13.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a475  // smmla v21.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xd0]\n"
+      ".inst 0x4e99a42a  // smmla v10.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a472  // smmla v18.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xe0]\n"
+      ".inst 0x4e98a42e  // smmla v14.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a476  // smmla v22.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e99a42b  // smmla v11.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a473  // smmla v19.4s, v3.16b, v25.16b\n"
+      ".inst 0x4e98a42f  // smmla v15.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a477  // smmla v23.4s, v3.16b, v24.16b\n"
       "67:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 74f\n"
       "cmp x27, #0x8\n"
       "blt 69f\n"
       "68:"  // Height 3: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr q6, [x9, #0x0]\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
+      "ldr d25, [x26], #0x8\n"
+      "ldr d24, [x25], #0x8\n"
+      "trn1 v27.2d, v25.2d, v24.2d\n"
+      "ldr d24, [x24], #0x8\n"
+      "ldr q25, [x9, #0x0]\n"
+      "trn1 v26.2d, v24.2d, v26.2d\n"
+      ".inst 0x4e99a768  // smmla v8.4s, v27.16b, v25.16b\n"
+      "ldr q24, [x9, #0x10]\n"
+      ".inst 0x4e99a750  // smmla v16.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e98a76c  // smmla v12.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a754  // smmla v20.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
       "bge 68b\n"
       "69:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 74f\n"
@@ -1082,74 +1081,74 @@
       "ldr b2, [x25, #0x0]\n"
       "ldr b3, [x24, #0x0]\n"
       "73:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x9, #0x0]\n"
-      "ldr q6, [x9, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
+      "ldr q25, [x9, #0x0]\n"
+      "ldr q28, [x9, #0x10]\n"
+      "trn1 v27.2d, v1.2d, v2.2d\n"
+      "trn1 v26.2d, v3.2d, v24.2d\n"
+      ".inst 0x4e99a768  // smmla v8.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a750  // smmla v16.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e9ca76c  // smmla v12.4s, v27.16b, v28.16b\n"
+      ".inst 0x4e9ca754  // smmla v20.4s, v26.16b, v28.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
       "74:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 62b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "uzp1 v7.2d, v8.2d, v12.2d\n"
+      "ldr q28, [x14, #0x0]\n"
+      "ldr q27, [x14, #0x10]\n"
+      "uzp1 v26.2d, v8.2d, v12.2d\n"
       "uzp2 v8.2d, v8.2d, v12.2d\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
+      "ldr q25, [x14, #0x20]\n"
+      "ldr q24, [x14, #0x30]\n"
       "uzp1 v12.2d, v9.2d, v13.2d\n"
       "uzp2 v9.2d, v9.2d, v13.2d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp1 v13.2d, v10.2d, v14.2d\n"
       "uzp2 v10.2d, v10.2d, v14.2d\n"
-      "add x24, x11, x20\n"
+      "add x25, x11, x20\n"
       "uzp1 v14.2d, v11.2d, v15.2d\n"
       "uzp2 v11.2d, v11.2d, v15.2d\n"
-      "add x23, x24, x20\n"
+      "add x24, x25, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
       "uzp1 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v17.2d, v17.2d, v21.2d\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp1 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v19.2d, v19.2d, v23.2d\n"
       "add x14, x14, #0x40\n"
-      "mov v23.16b, v7.16b\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v12.4s, v12.4s, v1.4s\n"
-      "add v13.4s, v13.4s, v2.4s\n"
-      "add v14.4s, v14.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "mov v23.16b, v26.16b\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v12.4s, v12.4s, v27.4s\n"
+      "add v13.4s, v13.4s, v25.4s\n"
+      "add v14.4s, v14.4s, v24.4s\n"
+      "add v8.4s, v8.4s, v28.4s\n"
+      "add v9.4s, v9.4s, v27.4s\n"
+      "add v10.4s, v10.4s, v25.4s\n"
+      "add v11.4s, v11.4s, v24.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v27.4s\n"
+      "add v18.4s, v18.4s, v25.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
       "tbz %x[flags], #4, 75f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -1163,10 +1162,10 @@
       "add x13, x13, #0x40\n"
       "b 76f\n"
       "75:"  // Height 3: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -1187,55 +1186,55 @@
       "sqrdmulh v18.4s, v18.4s, v6.4s\n"
       "sqrdmulh v19.4s, v19.4s, v7.4s\n"
       "tbz %x[flags], #5, 77f\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v12.16b, v1.16b\n"
-      "and v6.16b, v13.16b, v2.16b\n"
-      "and v7.16b, v14.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v12.4s, v12.4s, v5.4s\n"
-      "sqadd v13.4s, v13.4s, v6.4s\n"
-      "sqadd v14.4s, v14.4s, v7.4s\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "and v22.16b, v12.16b, v1.16b\n"
+      "and v21.16b, v13.16b, v2.16b\n"
+      "and v20.16b, v14.16b, v3.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
+      "sqadd v12.4s, v12.4s, v22.4s\n"
+      "sqadd v13.4s, v13.4s, v21.4s\n"
+      "sqadd v14.4s, v14.4s, v20.4s\n"
+      "and v24.16b, v8.16b, v0.16b\n"
+      "and v22.16b, v9.16b, v1.16b\n"
+      "and v21.16b, v10.16b, v2.16b\n"
+      "and v20.16b, v11.16b, v3.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v24.4s\n"
+      "sqadd v9.4s, v9.4s, v22.4s\n"
+      "sqadd v10.4s, v10.4s, v21.4s\n"
+      "sqadd v11.4s, v11.4s, v20.4s\n"
+      "and v24.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v1.16b\n"
+      "and v21.16b, v18.16b, v2.16b\n"
+      "and v20.16b, v19.16b, v3.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v24.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "77:"  // Height 3: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "srshl v12.4s, v12.4s, v1.4s\n"
       "srshl v13.4s, v13.4s, v2.4s\n"
       "srshl v14.4s, v14.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -1243,132 +1242,132 @@
       "srshl v17.4s, v17.4s, v1.4s\n"
       "srshl v18.4s, v18.4s, v2.4s\n"
       "srshl v19.4s, v19.4s, v3.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v23.4s, v23.4s, v22.4s\n"
+      "add v12.4s, v12.4s, v22.4s\n"
+      "add v13.4s, v13.4s, v22.4s\n"
+      "add v14.4s, v14.4s, v22.4s\n"
+      "add v8.4s, v8.4s, v22.4s\n"
+      "add v9.4s, v9.4s, v22.4s\n"
+      "add v10.4s, v10.4s, v22.4s\n"
+      "add v11.4s, v11.4s, v22.4s\n"
+      "add v16.4s, v16.4s, v22.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v22.4s\n"
+      "add v19.4s, v19.4s, v22.4s\n"
+      "smin v23.4s, v23.4s, v21.4s\n"
+      "smin v12.4s, v12.4s, v21.4s\n"
+      "smin v13.4s, v13.4s, v21.4s\n"
+      "smin v14.4s, v14.4s, v21.4s\n"
+      "smin v8.4s, v8.4s, v21.4s\n"
+      "smin v9.4s, v9.4s, v21.4s\n"
+      "smin v10.4s, v10.4s, v21.4s\n"
+      "smin v11.4s, v11.4s, v21.4s\n"
+      "smin v16.4s, v16.4s, v21.4s\n"
+      "smin v17.4s, v17.4s, v21.4s\n"
+      "smin v18.4s, v18.4s, v21.4s\n"
+      "smin v19.4s, v19.4s, v21.4s\n"
+      "smax v23.4s, v23.4s, v20.4s\n"
+      "smax v12.4s, v12.4s, v20.4s\n"
+      "smax v13.4s, v13.4s, v20.4s\n"
+      "smax v14.4s, v14.4s, v20.4s\n"
+      "smax v8.4s, v8.4s, v20.4s\n"
+      "smax v9.4s, v9.4s, v20.4s\n"
+      "smax v10.4s, v10.4s, v20.4s\n"
+      "smax v11.4s, v11.4s, v20.4s\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v23.8h, v23.8h, v12.8h\n"
-      "uzp1 v12.8h, v13.8h, v14.8h\n"
+      "uzp1 v21.8h, v13.8h, v14.8h\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v20.8h, v10.8h, v11.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "uzp1 v23.16b, v23.16b, v12.16b\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v23.16b, v23.16b, v21.16b\n"
+      "uzp1 v8.16b, v8.16b, v20.16b\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
       "bge 86f\n"
       "tbz x10, #3, 81f\n"
       "str d23, [x11], #0x8\n"
-      "str d8, [x24], #0x8\n"
-      "str d16, [x23], #0x8\n"
+      "str d8, [x25], #0x8\n"
+      "str d16, [x24], #0x8\n"
       "tbz x10, #2, 79f\n"
       "st1 { v23.s }[2], [x11], #0x4\n"
-      "st1 { v8.s }[2], [x24], #0x4\n"
-      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v8.s }[2], [x25], #0x4\n"
+      "st1 { v16.s }[2], [x24], #0x4\n"
       "tbz x10, #1, 78f\n"
       "st1 { v23.h }[6], [x11], #0x2\n"
-      "st1 { v8.h }[6], [x24], #0x2\n"
-      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v8.h }[6], [x25], #0x2\n"
+      "st1 { v16.h }[6], [x24], #0x2\n"
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[14], [x11]\n"
-      "st1 { v8.b }[14], [x24]\n"
-      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v8.b }[14], [x25]\n"
+      "st1 { v16.b }[14], [x24]\n"
       "b 85f\n"
       "78:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[12], [x11]\n"
-      "st1 { v8.b }[12], [x24]\n"
-      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v8.b }[12], [x25]\n"
+      "st1 { v16.b }[12], [x24]\n"
       "b 85f\n"
       "79:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x10, #1, 80f\n"
       "st1 { v23.h }[4], [x11], #0x2\n"
-      "st1 { v8.h }[4], [x24], #0x2\n"
-      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v8.h }[4], [x25], #0x2\n"
+      "st1 { v16.h }[4], [x24], #0x2\n"
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[10], [x11]\n"
-      "st1 { v8.b }[10], [x24]\n"
-      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v8.b }[10], [x25]\n"
+      "st1 { v16.b }[10], [x24]\n"
       "b 85f\n"
       "80:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[8], [x11]\n"
-      "st1 { v8.b }[8], [x24]\n"
-      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v8.b }[8], [x25]\n"
+      "st1 { v16.b }[8], [x24]\n"
       "b 85f\n"
       "81:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x10, #2, 83f\n"
       "str s23, [x11], #0x4\n"
-      "str s8, [x24], #0x4\n"
-      "str s16, [x23], #0x4\n"
+      "str s8, [x25], #0x4\n"
+      "str s16, [x24], #0x4\n"
       "tbz x10, #1, 82f\n"
       "st1 { v23.h }[2], [x11], #0x2\n"
-      "st1 { v8.h }[2], [x24], #0x2\n"
-      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v8.h }[2], [x25], #0x2\n"
+      "st1 { v16.h }[2], [x24], #0x2\n"
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[6], [x11]\n"
-      "st1 { v8.b }[6], [x24]\n"
-      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v8.b }[6], [x25]\n"
+      "st1 { v16.b }[6], [x24]\n"
       "b 85f\n"
       "82:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[4], [x11]\n"
-      "st1 { v8.b }[4], [x24]\n"
-      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v8.b }[4], [x25]\n"
+      "st1 { v16.b }[4], [x24]\n"
       "b 85f\n"
       "83:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x10, #1, 84f\n"
       "str h23, [x11], #0x2\n"
-      "str h8, [x24], #0x2\n"
-      "str h16, [x23], #0x2\n"
+      "str h8, [x25], #0x2\n"
+      "str h16, [x24], #0x2\n"
       "tbz x10, #0, 85f\n"
       "st1 { v23.b }[2], [x11]\n"
-      "st1 { v8.b }[2], [x24]\n"
-      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v8.b }[2], [x25]\n"
+      "st1 { v16.b }[2], [x24]\n"
       "b 85f\n"
       "84:"  // Height 3: Partial direct writeback: partial_1_0
       "str b23, [x11, #0x0]\n"
-      "str b8, [x24, #0x0]\n"
-      "str b16, [x23, #0x0]\n"
+      "str b8, [x25, #0x0]\n"
+      "str b16, [x24, #0x0]\n"
       "85:"  // Height 3: Partial direct writeback: Done
       "b 87f\n"
       "86:"  // Height 3: Full writeback
       "str q23, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q8, [x24, #0x0]\n"
-      "str q16, [x23, #0x0]\n"
+      "str q8, [x25, #0x0]\n"
+      "str q16, [x24, #0x0]\n"
       "87:"  // Height 3: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 60b\n"
@@ -1402,14 +1401,14 @@
       "91:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 92f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 93f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1419,9 +1418,9 @@
       "b 93f\n"
       "92:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "93:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "blt 96f\n"
@@ -1434,173 +1433,173 @@
       "ldr q6, [x9, #0x10]\n"
       "blt 95f\n"
       "94:"  // Height 4: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a768  // smmla v8.4s, v27.16b, v7.16b\n"
       "sub x27, x27, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      "trn1 v26.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e87a750  // smmla v16.4s, v26.16b, v7.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e86a76c  // smmla v12.4s, v27.16b, v6.16b\n"
+      ".inst 0x4e86a754  // smmla v20.4s, v26.16b, v6.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
       "add x23, x23, #0x10\n"
       "ldr q4, [x23, #0x0]\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x80]\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a428  // smmla v8.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a470  // smmla v16.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xa0]\n"
+      ".inst 0x4e98a42c  // smmla v12.4s, v1.16b, v24.16b\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e98a474  // smmla v20.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xb0]\n"
+      ".inst 0x4e99a429  // smmla v9.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a471  // smmla v17.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xc0]\n"
+      ".inst 0x4e98a42d  // smmla v13.4s, v1.16b, v24.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e98a475  // smmla v21.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xd0]\n"
+      ".inst 0x4e99a42a  // smmla v10.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a472  // smmla v18.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xe0]\n"
+      ".inst 0x4e98a42e  // smmla v14.4s, v1.16b, v24.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      ".inst 0x4e98a476  // smmla v22.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e99a42b  // smmla v11.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a473  // smmla v19.4s, v3.16b, v25.16b\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e98a42f  // smmla v15.4s, v1.16b, v24.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e98a477  // smmla v23.4s, v3.16b, v24.16b\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x9, #0x10]\n"
       "bge 94b\n"
       "95:"  // Height 4: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a768  // smmla v8.4s, v27.16b, v7.16b\n"
       "add x26, x26, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      "trn1 v26.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e87a750  // smmla v16.4s, v26.16b, v7.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e86a76c  // smmla v12.4s, v27.16b, v6.16b\n"
+      ".inst 0x4e86a754  // smmla v20.4s, v26.16b, v6.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x80]\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x90]\n"
+      ".inst 0x4e99a428  // smmla v8.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a470  // smmla v16.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xa0]\n"
+      ".inst 0x4e98a42c  // smmla v12.4s, v1.16b, v24.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e98a474  // smmla v20.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xb0]\n"
+      ".inst 0x4e99a429  // smmla v9.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      ".inst 0x4e99a471  // smmla v17.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xc0]\n"
+      ".inst 0x4e98a42d  // smmla v13.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a475  // smmla v21.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xd0]\n"
+      ".inst 0x4e99a42a  // smmla v10.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a472  // smmla v18.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x9, #0xe0]\n"
+      ".inst 0x4e98a42e  // smmla v14.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a476  // smmla v22.4s, v3.16b, v24.16b\n"
+      "ldr q24, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e99a42b  // smmla v11.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a473  // smmla v19.4s, v3.16b, v25.16b\n"
+      ".inst 0x4e98a42f  // smmla v15.4s, v1.16b, v24.16b\n"
+      ".inst 0x4e98a477  // smmla v23.4s, v3.16b, v24.16b\n"
       "96:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 103f\n"
       "cmp x27, #0x8\n"
       "blt 98f\n"
       "97:"  // Height 4: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d25, [x26], #0x8\n"
+      "ldr d24, [x25], #0x8\n"
+      "trn1 v27.2d, v25.2d, v24.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "trn1 v26.2d, v25.2d, v24.2d\n"
       "cmp x27, #0x8\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x70]\n"
+      "ldr q25, [x9, #0x0]\n"
+      "ldr q24, [x9, #0x10]\n"
+      ".inst 0x4e99a768  // smmla v8.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a750  // smmla v16.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e98a76c  // smmla v12.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a754  // smmla v20.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
       "bge 97b\n"
       "98:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 103f\n"
@@ -1645,84 +1644,84 @@
       "ldr b3, [x24, #0x0]\n"
       "ldr b4, [x23, #0x0]\n"
       "102:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x9, #0x0]\n"
-      "ldr q6, [x9, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
+      "ldr q25, [x9, #0x0]\n"
+      "ldr q24, [x9, #0x10]\n"
+      "trn1 v27.2d, v1.2d, v2.2d\n"
+      "trn1 v26.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e99a768  // smmla v8.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a750  // smmla v16.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x20]\n"
+      ".inst 0x4e98a76c  // smmla v12.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a754  // smmla v20.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x30]\n"
+      ".inst 0x4e99a769  // smmla v9.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a751  // smmla v17.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x40]\n"
+      ".inst 0x4e98a76d  // smmla v13.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a755  // smmla v21.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x50]\n"
+      ".inst 0x4e99a76a  // smmla v10.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a752  // smmla v18.4s, v26.16b, v25.16b\n"
+      "ldr q25, [x9, #0x60]\n"
+      ".inst 0x4e98a76e  // smmla v14.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a756  // smmla v22.4s, v26.16b, v24.16b\n"
+      "ldr q24, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e99a76b  // smmla v11.4s, v27.16b, v25.16b\n"
+      ".inst 0x4e99a753  // smmla v19.4s, v26.16b, v25.16b\n"
+      ".inst 0x4e98a76f  // smmla v15.4s, v27.16b, v24.16b\n"
+      ".inst 0x4e98a757  // smmla v23.4s, v26.16b, v24.16b\n"
       "103:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 91b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "uzp1 v7.2d, v8.2d, v12.2d\n"
+      "ldr q28, [x14, #0x0]\n"
+      "ldr q27, [x14, #0x10]\n"
+      "uzp1 v26.2d, v8.2d, v12.2d\n"
       "uzp2 v8.2d, v8.2d, v12.2d\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
+      "ldr q25, [x14, #0x20]\n"
+      "ldr q24, [x14, #0x30]\n"
       "uzp1 v12.2d, v9.2d, v13.2d\n"
       "uzp2 v9.2d, v9.2d, v13.2d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp1 v13.2d, v10.2d, v14.2d\n"
       "uzp2 v10.2d, v10.2d, v14.2d\n"
-      "add x24, x11, x20\n"
+      "add x25, x11, x20\n"
       "uzp1 v14.2d, v11.2d, v15.2d\n"
       "uzp2 v11.2d, v11.2d, v15.2d\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
       "uzp1 v15.2d, v16.2d, v20.2d\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "prfm pstl1keep, [x11, #0x0]\n"
-      "prfm pstl1keep, [x24, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
+      "prfm pstl1keep, [x24, #0x0]\n"
       "prfm pstl1keep, [x23, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "add x14, x14, #0x40\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
-      "mov v23.16b, v7.16b\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v12.4s, v12.4s, v1.4s\n"
-      "add v13.4s, v13.4s, v2.4s\n"
-      "add v14.4s, v14.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
-      "add v15.4s, v15.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "mov v23.16b, v26.16b\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v12.4s, v12.4s, v27.4s\n"
+      "add v13.4s, v13.4s, v25.4s\n"
+      "add v14.4s, v14.4s, v24.4s\n"
+      "add v8.4s, v8.4s, v28.4s\n"
+      "add v9.4s, v9.4s, v27.4s\n"
+      "add v10.4s, v10.4s, v25.4s\n"
+      "add v11.4s, v11.4s, v24.4s\n"
+      "add v15.4s, v15.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
+      "add v21.4s, v21.4s, v25.4s\n"
+      "add v22.4s, v22.4s, v24.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v27.4s\n"
+      "add v18.4s, v18.4s, v25.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
       "tbz %x[flags], #4, 104f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -1736,10 +1735,10 @@
       "add x13, x13, #0x40\n"
       "b 105f\n"
       "104:"  // Height 4: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -1764,67 +1763,67 @@
       "sqrdmulh v18.4s, v18.4s, v6.4s\n"
       "sqrdmulh v19.4s, v19.4s, v7.4s\n"
       "tbz %x[flags], #5, 106f\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v12.16b, v1.16b\n"
-      "and v6.16b, v13.16b, v2.16b\n"
-      "and v7.16b, v14.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v12.4s, v12.4s, v5.4s\n"
-      "sqadd v13.4s, v13.4s, v6.4s\n"
-      "sqadd v14.4s, v14.4s, v7.4s\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v15.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v1.16b\n"
-      "and v6.16b, v21.16b, v2.16b\n"
-      "and v7.16b, v22.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v15.4s, v15.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v27.16b, v23.16b, v0.16b\n"
+      "and v26.16b, v12.16b, v1.16b\n"
+      "and v25.16b, v13.16b, v2.16b\n"
+      "and v24.16b, v14.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v27.4s\n"
+      "sqadd v12.4s, v12.4s, v26.4s\n"
+      "sqadd v13.4s, v13.4s, v25.4s\n"
+      "sqadd v14.4s, v14.4s, v24.4s\n"
+      "and v27.16b, v8.16b, v0.16b\n"
+      "and v26.16b, v9.16b, v1.16b\n"
+      "and v25.16b, v10.16b, v2.16b\n"
+      "and v24.16b, v11.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v27.4s\n"
+      "sqadd v9.4s, v9.4s, v26.4s\n"
+      "sqadd v10.4s, v10.4s, v25.4s\n"
+      "sqadd v11.4s, v11.4s, v24.4s\n"
+      "and v27.16b, v15.16b, v0.16b\n"
+      "and v26.16b, v20.16b, v1.16b\n"
+      "and v25.16b, v21.16b, v2.16b\n"
+      "and v24.16b, v22.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v27.4s\n"
+      "sqadd v20.4s, v20.4s, v26.4s\n"
+      "sqadd v21.4s, v21.4s, v25.4s\n"
+      "sqadd v22.4s, v22.4s, v24.4s\n"
+      "and v27.16b, v16.16b, v0.16b\n"
+      "and v26.16b, v17.16b, v1.16b\n"
+      "and v25.16b, v18.16b, v2.16b\n"
+      "and v24.16b, v19.16b, v3.16b\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v27.4s\n"
+      "sqadd v17.4s, v17.4s, v26.4s\n"
+      "sqadd v18.4s, v18.4s, v25.4s\n"
+      "sqadd v19.4s, v19.4s, v24.4s\n"
       "106:"  // Height 4: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "srshl v12.4s, v12.4s, v1.4s\n"
       "srshl v13.4s, v13.4s, v2.4s\n"
       "srshl v14.4s, v14.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -1836,163 +1835,163 @@
       "srshl v17.4s, v17.4s, v1.4s\n"
       "srshl v18.4s, v18.4s, v2.4s\n"
       "srshl v19.4s, v19.4s, v3.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v23.4s, v23.4s, v26.4s\n"
+      "add v12.4s, v12.4s, v26.4s\n"
+      "add v13.4s, v13.4s, v26.4s\n"
+      "add v14.4s, v14.4s, v26.4s\n"
+      "add v8.4s, v8.4s, v26.4s\n"
+      "add v9.4s, v9.4s, v26.4s\n"
+      "add v10.4s, v10.4s, v26.4s\n"
+      "add v11.4s, v11.4s, v26.4s\n"
+      "add v15.4s, v15.4s, v26.4s\n"
+      "add v20.4s, v20.4s, v26.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v16.4s, v16.4s, v26.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v26.4s\n"
+      "smin v23.4s, v23.4s, v25.4s\n"
+      "smin v12.4s, v12.4s, v25.4s\n"
+      "smin v13.4s, v13.4s, v25.4s\n"
+      "smin v14.4s, v14.4s, v25.4s\n"
+      "smin v8.4s, v8.4s, v25.4s\n"
+      "smin v9.4s, v9.4s, v25.4s\n"
+      "smin v10.4s, v10.4s, v25.4s\n"
+      "smin v11.4s, v11.4s, v25.4s\n"
+      "smin v15.4s, v15.4s, v25.4s\n"
+      "smin v20.4s, v20.4s, v25.4s\n"
+      "smin v21.4s, v21.4s, v25.4s\n"
+      "smin v22.4s, v22.4s, v25.4s\n"
+      "smin v16.4s, v16.4s, v25.4s\n"
+      "smin v17.4s, v17.4s, v25.4s\n"
+      "smin v18.4s, v18.4s, v25.4s\n"
+      "smin v19.4s, v19.4s, v25.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
+      "smax v12.4s, v12.4s, v24.4s\n"
+      "smax v13.4s, v13.4s, v24.4s\n"
+      "smax v14.4s, v14.4s, v24.4s\n"
+      "smax v8.4s, v8.4s, v24.4s\n"
+      "smax v9.4s, v9.4s, v24.4s\n"
+      "smax v10.4s, v10.4s, v24.4s\n"
+      "smax v11.4s, v11.4s, v24.4s\n"
+      "smax v15.4s, v15.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
       "uzp1 v23.8h, v23.8h, v12.8h\n"
-      "uzp1 v12.8h, v13.8h, v14.8h\n"
+      "uzp1 v25.8h, v13.8h, v14.8h\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v24.8h, v10.8h, v11.8h\n"
       "uzp1 v15.8h, v15.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "uzp1 v23.16b, v23.16b, v12.16b\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v23.16b, v23.16b, v25.16b\n"
+      "uzp1 v8.16b, v8.16b, v24.16b\n"
       "uzp1 v15.16b, v15.16b, v20.16b\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
       "bge 115f\n"
       "tbz x10, #3, 110f\n"
       "str d23, [x11], #0x8\n"
-      "str d8, [x24], #0x8\n"
-      "str d15, [x23], #0x8\n"
-      "str d16, [x22], #0x8\n"
+      "str d8, [x25], #0x8\n"
+      "str d15, [x24], #0x8\n"
+      "str d16, [x23], #0x8\n"
       "tbz x10, #2, 108f\n"
       "st1 { v23.s }[2], [x11], #0x4\n"
-      "st1 { v8.s }[2], [x24], #0x4\n"
-      "st1 { v15.s }[2], [x23], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
+      "st1 { v8.s }[2], [x25], #0x4\n"
+      "st1 { v15.s }[2], [x24], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
       "tbz x10, #1, 107f\n"
       "st1 { v23.h }[6], [x11], #0x2\n"
-      "st1 { v8.h }[6], [x24], #0x2\n"
-      "st1 { v15.h }[6], [x23], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
+      "st1 { v8.h }[6], [x25], #0x2\n"
+      "st1 { v15.h }[6], [x24], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[14], [x11]\n"
-      "st1 { v8.b }[14], [x24]\n"
-      "st1 { v15.b }[14], [x23]\n"
-      "st1 { v16.b }[14], [x22]\n"
+      "st1 { v8.b }[14], [x25]\n"
+      "st1 { v15.b }[14], [x24]\n"
+      "st1 { v16.b }[14], [x23]\n"
       "b 114f\n"
       "107:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[12], [x11]\n"
-      "st1 { v8.b }[12], [x24]\n"
-      "st1 { v15.b }[12], [x23]\n"
-      "st1 { v16.b }[12], [x22]\n"
+      "st1 { v8.b }[12], [x25]\n"
+      "st1 { v15.b }[12], [x24]\n"
+      "st1 { v16.b }[12], [x23]\n"
       "b 114f\n"
       "108:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x10, #1, 109f\n"
       "st1 { v23.h }[4], [x11], #0x2\n"
-      "st1 { v8.h }[4], [x24], #0x2\n"
-      "st1 { v15.h }[4], [x23], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
+      "st1 { v8.h }[4], [x25], #0x2\n"
+      "st1 { v15.h }[4], [x24], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[10], [x11]\n"
-      "st1 { v8.b }[10], [x24]\n"
-      "st1 { v15.b }[10], [x23]\n"
-      "st1 { v16.b }[10], [x22]\n"
+      "st1 { v8.b }[10], [x25]\n"
+      "st1 { v15.b }[10], [x24]\n"
+      "st1 { v16.b }[10], [x23]\n"
       "b 114f\n"
       "109:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[8], [x11]\n"
-      "st1 { v8.b }[8], [x24]\n"
-      "st1 { v15.b }[8], [x23]\n"
-      "st1 { v16.b }[8], [x22]\n"
+      "st1 { v8.b }[8], [x25]\n"
+      "st1 { v15.b }[8], [x24]\n"
+      "st1 { v16.b }[8], [x23]\n"
       "b 114f\n"
       "110:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x10, #2, 112f\n"
       "str s23, [x11], #0x4\n"
-      "str s8, [x24], #0x4\n"
-      "str s15, [x23], #0x4\n"
-      "str s16, [x22], #0x4\n"
+      "str s8, [x25], #0x4\n"
+      "str s15, [x24], #0x4\n"
+      "str s16, [x23], #0x4\n"
       "tbz x10, #1, 111f\n"
       "st1 { v23.h }[2], [x11], #0x2\n"
-      "st1 { v8.h }[2], [x24], #0x2\n"
-      "st1 { v15.h }[2], [x23], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
+      "st1 { v8.h }[2], [x25], #0x2\n"
+      "st1 { v15.h }[2], [x24], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[6], [x11]\n"
-      "st1 { v8.b }[6], [x24]\n"
-      "st1 { v15.b }[6], [x23]\n"
-      "st1 { v16.b }[6], [x22]\n"
+      "st1 { v8.b }[6], [x25]\n"
+      "st1 { v15.b }[6], [x24]\n"
+      "st1 { v16.b }[6], [x23]\n"
       "b 114f\n"
       "111:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[4], [x11]\n"
-      "st1 { v8.b }[4], [x24]\n"
-      "st1 { v15.b }[4], [x23]\n"
-      "st1 { v16.b }[4], [x22]\n"
+      "st1 { v8.b }[4], [x25]\n"
+      "st1 { v15.b }[4], [x24]\n"
+      "st1 { v16.b }[4], [x23]\n"
       "b 114f\n"
       "112:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x10, #1, 113f\n"
       "str h23, [x11], #0x2\n"
-      "str h8, [x24], #0x2\n"
-      "str h15, [x23], #0x2\n"
-      "str h16, [x22], #0x2\n"
+      "str h8, [x25], #0x2\n"
+      "str h15, [x24], #0x2\n"
+      "str h16, [x23], #0x2\n"
       "tbz x10, #0, 114f\n"
       "st1 { v23.b }[2], [x11]\n"
-      "st1 { v8.b }[2], [x24]\n"
-      "st1 { v15.b }[2], [x23]\n"
-      "st1 { v16.b }[2], [x22]\n"
+      "st1 { v8.b }[2], [x25]\n"
+      "st1 { v15.b }[2], [x24]\n"
+      "st1 { v16.b }[2], [x23]\n"
       "b 114f\n"
       "113:"  // Height 4: Partial direct writeback: partial_1_0
       "str b23, [x11, #0x0]\n"
-      "str b8, [x24, #0x0]\n"
-      "str b15, [x23, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
+      "str b8, [x25, #0x0]\n"
+      "str b15, [x24, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
       "114:"  // Height 4: Partial direct writeback: Done
       "b 116f\n"
       "115:"  // Height 4: Full writeback
       "str q23, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q8, [x24, #0x0]\n"
-      "str q15, [x23, #0x0]\n"
-      "str q16, [x22, #0x0]\n"
+      "str q8, [x25, #0x0]\n"
+      "str q15, [x24, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
       "116:"  // Height 4: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 89b\n"
@@ -2034,15 +2033,15 @@
       "120:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 121f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 122f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2053,10 +2052,10 @@
       "b 122f\n"
       "121:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "122:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "blt 125f\n"
@@ -2120,42 +2119,42 @@
       ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
       "ldr q2, [x25, #0x0]\n"
       ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
+      "ldr q0, [x9, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      "ldr q6, [x9, #0xa0]\n"
+      ".inst 0x4e80a42c  // smmla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bc  // smmla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xb0]\n"
+      ".inst 0x4e86a429  // smmla v9.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a471  // smmla v17.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4b9  // smmla v25.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x9, #0xc0]\n"
+      ".inst 0x4e80a42d  // smmla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bd  // smmla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xd0]\n"
+      ".inst 0x4e86a42a  // smmla v10.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a472  // smmla v18.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4ba  // smmla v26.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x9, #0xe0]\n"
+      ".inst 0x4e80a42e  // smmla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4be  // smmla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x4e86a42b  // smmla v11.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a473  // smmla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4bb  // smmla v27.4s, v5.16b, v6.16b\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e80a42f  // smmla v15.4s, v1.16b, v0.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x4e80a4bf  // smmla v31.4s, v5.16b, v0.16b\n"
       "ldr q5, [x22, #0x0]\n"
       "bge 123b\n"
       "124:"  // Height 5: Multiply loop: Single iteration only
@@ -2208,86 +2207,86 @@
       ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
       ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
       ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
+      "ldr q0, [x9, #0x90]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      "ldr q2, [x9, #0xa0]\n"
+      ".inst 0x4e80a42c  // smmla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bc  // smmla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xb0]\n"
+      ".inst 0x4e82a429  // smmla v9.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a471  // smmla v17.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4b9  // smmla v25.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x9, #0xc0]\n"
+      ".inst 0x4e80a42d  // smmla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bd  // smmla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xd0]\n"
+      ".inst 0x4e82a42a  // smmla v10.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a472  // smmla v18.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4ba  // smmla v26.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x9, #0xe0]\n"
+      ".inst 0x4e80a42e  // smmla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4be  // smmla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x4e82a42b  // smmla v11.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a473  // smmla v19.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4bb  // smmla v27.4s, v5.16b, v2.16b\n"
+      ".inst 0x4e80a42f  // smmla v15.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bf  // smmla v31.4s, v5.16b, v0.16b\n"
       "125:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 132f\n"
       "cmp x27, #0x8\n"
       "blt 127f\n"
       "126:"  // Height 5: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr q6, [x9, #0x0]\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a498  // smmla v24.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
+      "ldr d0, [x22], #0x8\n"
+      "ldr q1, [x9, #0x0]\n"
+      "trn1 v2.2d, v0.2d, v2.2d\n"
+      ".inst 0x4e81a488  // smmla v8.4s, v4.16b, v1.16b\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x4e81a470  // smmla v16.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a458  // smmla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x20]\n"
+      ".inst 0x4e80a48c  // smmla v12.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e87a49c  // smmla v28.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a499  // smmla v25.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49d  // smmla v29.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49a  // smmla v26.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49e  // smmla v30.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x70]\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e80a45c  // smmla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x30]\n"
+      ".inst 0x4e81a489  // smmla v9.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a471  // smmla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x40]\n"
+      ".inst 0x4e80a48d  // smmla v13.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45d  // smmla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x50]\n"
+      ".inst 0x4e81a48a  // smmla v10.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a472  // smmla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45a  // smmla v26.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x60]\n"
+      ".inst 0x4e80a48e  // smmla v14.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45e  // smmla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x70]\n"
+      ".inst 0x4e81a48b  // smmla v11.4s, v4.16b, v1.16b\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49b  // smmla v27.4s, v4.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49f  // smmla v31.4s, v4.16b, v7.16b\n"
+      ".inst 0x4e81a473  // smmla v19.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45b  // smmla v27.4s, v2.16b, v1.16b\n"
+      ".inst 0x4e80a48f  // smmla v15.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45f  // smmla v31.4s, v2.16b, v0.16b\n"
       "bge 126b\n"
       "127:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 132f\n"
@@ -2340,74 +2339,74 @@
       "ldr b5, [x22, #0x0]\n"
       "131:"  // Height 5: Multiply loop: Ragged operand read: Done
       "ldr q7, [x9, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a498  // smmla v24.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49c  // smmla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a499  // smmla v25.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49d  // smmla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49a  // smmla v26.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49e  // smmla v30.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      "trn1 v2.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x4e87a4c8  // smmla v8.4s, v6.16b, v7.16b\n"
+      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e87a458  // smmla v24.4s, v2.16b, v7.16b\n"
+      "ldr q1, [x9, #0x20]\n"
+      ".inst 0x4e80a4cc  // smmla v12.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45c  // smmla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x30]\n"
+      ".inst 0x4e81a4c9  // smmla v9.4s, v6.16b, v1.16b\n"
+      ".inst 0x4e81a471  // smmla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x40]\n"
+      ".inst 0x4e80a4cd  // smmla v13.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45d  // smmla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x50]\n"
+      ".inst 0x4e81a4ca  // smmla v10.4s, v6.16b, v1.16b\n"
+      ".inst 0x4e81a472  // smmla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45a  // smmla v26.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x60]\n"
+      ".inst 0x4e80a4ce  // smmla v14.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45e  // smmla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49b  // smmla v27.4s, v4.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
+      ".inst 0x4e81a4cb  // smmla v11.4s, v6.16b, v1.16b\n"
+      ".inst 0x4e81a473  // smmla v19.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45b  // smmla v27.4s, v2.16b, v1.16b\n"
+      ".inst 0x4e80a4cf  // smmla v15.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45f  // smmla v31.4s, v2.16b, v0.16b\n"
       "132:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 120b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "uzp1 v7.2d, v8.2d, v12.2d\n"
+      "ldr q4, [x14, #0x0]\n"
+      "ldr q3, [x14, #0x10]\n"
+      "uzp1 v2.2d, v8.2d, v12.2d\n"
       "uzp2 v8.2d, v8.2d, v12.2d\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
+      "ldr q1, [x14, #0x20]\n"
+      "ldr q0, [x14, #0x30]\n"
       "uzp1 v12.2d, v9.2d, v13.2d\n"
       "uzp2 v9.2d, v9.2d, v13.2d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
+      "add x25, x11, x20\n"
       "uzp1 v13.2d, v10.2d, v14.2d\n"
       "uzp2 v10.2d, v10.2d, v14.2d\n"
       "uzp1 v14.2d, v11.2d, v15.2d\n"
       "uzp2 v11.2d, v11.2d, v15.2d\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
       "uzp1 v15.2d, v16.2d, v20.2d\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
-      "add x21, x22, x20\n"
+      "add x22, x23, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "add x14, x14, #0x40\n"
@@ -2415,27 +2414,27 @@
       "uzp1 v25.2d, v25.2d, v29.2d\n"
       "uzp1 v26.2d, v26.2d, v30.2d\n"
       "uzp1 v27.2d, v27.2d, v31.2d\n"
-      "mov v31.16b, v7.16b\n"
-      "add v31.4s, v31.4s, v0.4s\n"
-      "add v12.4s, v12.4s, v1.4s\n"
-      "add v13.4s, v13.4s, v2.4s\n"
-      "add v14.4s, v14.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
-      "add v15.4s, v15.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
+      "mov v31.16b, v2.16b\n"
+      "add v31.4s, v31.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v3.4s\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v14.4s, v14.4s, v0.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v3.4s\n"
+      "add v10.4s, v10.4s, v1.4s\n"
+      "add v11.4s, v11.4s, v0.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v3.4s\n"
+      "add v18.4s, v18.4s, v1.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v3.4s\n"
+      "add v26.4s, v26.4s, v1.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
       "tbz %x[flags], #4, 133f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -2449,10 +2448,10 @@
       "add x13, x13, #0x40\n"
       "b 134f\n"
       "133:"  // Height 5: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -2481,79 +2480,79 @@
       "sqrdmulh v26.4s, v26.4s, v6.4s\n"
       "sqrdmulh v27.4s, v27.4s, v7.4s\n"
       "tbz %x[flags], #5, 135f\n"
-      "and v4.16b, v31.16b, v0.16b\n"
-      "and v5.16b, v12.16b, v1.16b\n"
-      "and v6.16b, v13.16b, v2.16b\n"
-      "and v7.16b, v14.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v31.4s, v31.4s, v4.4s\n"
-      "sqadd v12.4s, v12.4s, v5.4s\n"
-      "sqadd v13.4s, v13.4s, v6.4s\n"
-      "sqadd v14.4s, v14.4s, v7.4s\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v15.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v1.16b\n"
-      "and v6.16b, v21.16b, v2.16b\n"
-      "and v7.16b, v22.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v15.4s, v15.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v24.16b, v0.16b\n"
-      "and v5.16b, v25.16b, v1.16b\n"
-      "and v6.16b, v26.16b, v2.16b\n"
-      "and v7.16b, v27.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v24.4s, v24.4s, v4.4s\n"
-      "sqadd v25.4s, v25.4s, v5.4s\n"
-      "sqadd v26.4s, v26.4s, v6.4s\n"
-      "sqadd v27.4s, v27.4s, v7.4s\n"
+      "and v30.16b, v31.16b, v0.16b\n"
+      "and v29.16b, v12.16b, v1.16b\n"
+      "and v28.16b, v13.16b, v2.16b\n"
+      "and v23.16b, v14.16b, v3.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v31.4s, v31.4s, v30.4s\n"
+      "sqadd v12.4s, v12.4s, v29.4s\n"
+      "sqadd v13.4s, v13.4s, v28.4s\n"
+      "sqadd v14.4s, v14.4s, v23.4s\n"
+      "and v30.16b, v8.16b, v0.16b\n"
+      "and v29.16b, v9.16b, v1.16b\n"
+      "and v28.16b, v10.16b, v2.16b\n"
+      "and v23.16b, v11.16b, v3.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v30.4s\n"
+      "sqadd v9.4s, v9.4s, v29.4s\n"
+      "sqadd v10.4s, v10.4s, v28.4s\n"
+      "sqadd v11.4s, v11.4s, v23.4s\n"
+      "and v30.16b, v15.16b, v0.16b\n"
+      "and v29.16b, v20.16b, v1.16b\n"
+      "and v28.16b, v21.16b, v2.16b\n"
+      "and v23.16b, v22.16b, v3.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v30.4s\n"
+      "sqadd v20.4s, v20.4s, v29.4s\n"
+      "sqadd v21.4s, v21.4s, v28.4s\n"
+      "sqadd v22.4s, v22.4s, v23.4s\n"
+      "and v30.16b, v16.16b, v0.16b\n"
+      "and v29.16b, v17.16b, v1.16b\n"
+      "and v28.16b, v18.16b, v2.16b\n"
+      "and v23.16b, v19.16b, v3.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v30.4s\n"
+      "sqadd v17.4s, v17.4s, v29.4s\n"
+      "sqadd v18.4s, v18.4s, v28.4s\n"
+      "sqadd v19.4s, v19.4s, v23.4s\n"
+      "and v30.16b, v24.16b, v0.16b\n"
+      "and v29.16b, v25.16b, v1.16b\n"
+      "and v28.16b, v26.16b, v2.16b\n"
+      "and v23.16b, v27.16b, v3.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v30.4s\n"
+      "sqadd v25.4s, v25.4s, v29.4s\n"
+      "sqadd v26.4s, v26.4s, v28.4s\n"
+      "sqadd v27.4s, v27.4s, v23.4s\n"
       "135:"  // Height 5: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
       "srshl v12.4s, v12.4s, v1.4s\n"
       "srshl v13.4s, v13.4s, v2.4s\n"
       "srshl v14.4s, v14.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -2569,194 +2568,194 @@
       "srshl v25.4s, v25.4s, v1.4s\n"
       "srshl v26.4s, v26.4s, v2.4s\n"
       "srshl v27.4s, v27.4s, v3.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v31.4s, v31.4s, v29.4s\n"
+      "add v12.4s, v12.4s, v29.4s\n"
+      "add v13.4s, v13.4s, v29.4s\n"
+      "add v14.4s, v14.4s, v29.4s\n"
+      "add v8.4s, v8.4s, v29.4s\n"
+      "add v9.4s, v9.4s, v29.4s\n"
+      "add v10.4s, v10.4s, v29.4s\n"
+      "add v11.4s, v11.4s, v29.4s\n"
+      "add v15.4s, v15.4s, v29.4s\n"
+      "add v20.4s, v20.4s, v29.4s\n"
+      "add v21.4s, v21.4s, v29.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v16.4s, v16.4s, v29.4s\n"
+      "add v17.4s, v17.4s, v29.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v29.4s\n"
+      "add v24.4s, v24.4s, v29.4s\n"
+      "add v25.4s, v25.4s, v29.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v29.4s\n"
+      "smin v31.4s, v31.4s, v28.4s\n"
+      "smin v12.4s, v12.4s, v28.4s\n"
+      "smin v13.4s, v13.4s, v28.4s\n"
+      "smin v14.4s, v14.4s, v28.4s\n"
+      "smin v8.4s, v8.4s, v28.4s\n"
+      "smin v9.4s, v9.4s, v28.4s\n"
+      "smin v10.4s, v10.4s, v28.4s\n"
+      "smin v11.4s, v11.4s, v28.4s\n"
+      "smin v15.4s, v15.4s, v28.4s\n"
+      "smin v20.4s, v20.4s, v28.4s\n"
+      "smin v21.4s, v21.4s, v28.4s\n"
+      "smin v22.4s, v22.4s, v28.4s\n"
+      "smin v16.4s, v16.4s, v28.4s\n"
+      "smin v17.4s, v17.4s, v28.4s\n"
+      "smin v18.4s, v18.4s, v28.4s\n"
+      "smin v19.4s, v19.4s, v28.4s\n"
+      "smin v24.4s, v24.4s, v28.4s\n"
+      "smin v25.4s, v25.4s, v28.4s\n"
+      "smin v26.4s, v26.4s, v28.4s\n"
+      "smin v27.4s, v27.4s, v28.4s\n"
+      "smax v31.4s, v31.4s, v23.4s\n"
+      "smax v12.4s, v12.4s, v23.4s\n"
+      "smax v13.4s, v13.4s, v23.4s\n"
+      "smax v14.4s, v14.4s, v23.4s\n"
+      "smax v8.4s, v8.4s, v23.4s\n"
+      "smax v9.4s, v9.4s, v23.4s\n"
+      "smax v10.4s, v10.4s, v23.4s\n"
+      "smax v11.4s, v11.4s, v23.4s\n"
+      "smax v15.4s, v15.4s, v23.4s\n"
+      "smax v20.4s, v20.4s, v23.4s\n"
+      "smax v21.4s, v21.4s, v23.4s\n"
+      "smax v22.4s, v22.4s, v23.4s\n"
+      "smax v16.4s, v16.4s, v23.4s\n"
+      "smax v17.4s, v17.4s, v23.4s\n"
+      "smax v18.4s, v18.4s, v23.4s\n"
+      "smax v19.4s, v19.4s, v23.4s\n"
+      "smax v24.4s, v24.4s, v23.4s\n"
+      "smax v25.4s, v25.4s, v23.4s\n"
+      "smax v26.4s, v26.4s, v23.4s\n"
+      "smax v27.4s, v27.4s, v23.4s\n"
       "uzp1 v31.8h, v31.8h, v12.8h\n"
-      "uzp1 v12.8h, v13.8h, v14.8h\n"
+      "uzp1 v28.8h, v13.8h, v14.8h\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v23.8h, v10.8h, v11.8h\n"
       "uzp1 v15.8h, v15.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "uzp1 v31.16b, v31.16b, v12.16b\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "uzp1 v31.16b, v31.16b, v28.16b\n"
+      "uzp1 v8.16b, v8.16b, v23.16b\n"
       "uzp1 v15.16b, v15.16b, v20.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 144f\n"
       "tbz x10, #3, 139f\n"
       "str d31, [x11], #0x8\n"
-      "str d8, [x24], #0x8\n"
-      "str d15, [x23], #0x8\n"
-      "str d16, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d8, [x25], #0x8\n"
+      "str d15, [x24], #0x8\n"
+      "str d16, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x10, #2, 137f\n"
       "st1 { v31.s }[2], [x11], #0x4\n"
-      "st1 { v8.s }[2], [x24], #0x4\n"
-      "st1 { v15.s }[2], [x23], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v8.s }[2], [x25], #0x4\n"
+      "st1 { v15.s }[2], [x24], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x10, #1, 136f\n"
       "st1 { v31.h }[6], [x11], #0x2\n"
-      "st1 { v8.h }[6], [x24], #0x2\n"
-      "st1 { v15.h }[6], [x23], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v8.h }[6], [x25], #0x2\n"
+      "st1 { v15.h }[6], [x24], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[14], [x11]\n"
-      "st1 { v8.b }[14], [x24]\n"
-      "st1 { v15.b }[14], [x23]\n"
-      "st1 { v16.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v8.b }[14], [x25]\n"
+      "st1 { v15.b }[14], [x24]\n"
+      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 143f\n"
       "136:"  // Height 5: Partial direct writeback: partial_1_12
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[12], [x11]\n"
-      "st1 { v8.b }[12], [x24]\n"
-      "st1 { v15.b }[12], [x23]\n"
-      "st1 { v16.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v8.b }[12], [x25]\n"
+      "st1 { v15.b }[12], [x24]\n"
+      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 143f\n"
       "137:"  // Height 5: Partial direct writeback: partial_2_8
       "tbz x10, #1, 138f\n"
       "st1 { v31.h }[4], [x11], #0x2\n"
-      "st1 { v8.h }[4], [x24], #0x2\n"
-      "st1 { v15.h }[4], [x23], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v8.h }[4], [x25], #0x2\n"
+      "st1 { v15.h }[4], [x24], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[10], [x11]\n"
-      "st1 { v8.b }[10], [x24]\n"
-      "st1 { v15.b }[10], [x23]\n"
-      "st1 { v16.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v8.b }[10], [x25]\n"
+      "st1 { v15.b }[10], [x24]\n"
+      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 143f\n"
       "138:"  // Height 5: Partial direct writeback: partial_1_8
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[8], [x11]\n"
-      "st1 { v8.b }[8], [x24]\n"
-      "st1 { v15.b }[8], [x23]\n"
-      "st1 { v16.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v8.b }[8], [x25]\n"
+      "st1 { v15.b }[8], [x24]\n"
+      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 143f\n"
       "139:"  // Height 5: Partial direct writeback: partial_4_0
       "tbz x10, #2, 141f\n"
       "str s31, [x11], #0x4\n"
-      "str s8, [x24], #0x4\n"
-      "str s15, [x23], #0x4\n"
-      "str s16, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s8, [x25], #0x4\n"
+      "str s15, [x24], #0x4\n"
+      "str s16, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x10, #1, 140f\n"
       "st1 { v31.h }[2], [x11], #0x2\n"
-      "st1 { v8.h }[2], [x24], #0x2\n"
-      "st1 { v15.h }[2], [x23], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v8.h }[2], [x25], #0x2\n"
+      "st1 { v15.h }[2], [x24], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[6], [x11]\n"
-      "st1 { v8.b }[6], [x24]\n"
-      "st1 { v15.b }[6], [x23]\n"
-      "st1 { v16.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v8.b }[6], [x25]\n"
+      "st1 { v15.b }[6], [x24]\n"
+      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 143f\n"
       "140:"  // Height 5: Partial direct writeback: partial_1_4
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[4], [x11]\n"
-      "st1 { v8.b }[4], [x24]\n"
-      "st1 { v15.b }[4], [x23]\n"
-      "st1 { v16.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v8.b }[4], [x25]\n"
+      "st1 { v15.b }[4], [x24]\n"
+      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 143f\n"
       "141:"  // Height 5: Partial direct writeback: partial_2_0
       "tbz x10, #1, 142f\n"
       "str h31, [x11], #0x2\n"
-      "str h8, [x24], #0x2\n"
-      "str h15, [x23], #0x2\n"
-      "str h16, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h8, [x25], #0x2\n"
+      "str h15, [x24], #0x2\n"
+      "str h16, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x10, #0, 143f\n"
       "st1 { v31.b }[2], [x11]\n"
-      "st1 { v8.b }[2], [x24]\n"
-      "st1 { v15.b }[2], [x23]\n"
-      "st1 { v16.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v8.b }[2], [x25]\n"
+      "st1 { v15.b }[2], [x24]\n"
+      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 143f\n"
       "142:"  // Height 5: Partial direct writeback: partial_1_0
       "str b31, [x11, #0x0]\n"
-      "str b8, [x24, #0x0]\n"
-      "str b15, [x23, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b8, [x25, #0x0]\n"
+      "str b15, [x24, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "143:"  // Height 5: Partial direct writeback: Done
       "b 145f\n"
       "144:"  // Height 5: Full writeback
       "str q31, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q8, [x24, #0x0]\n"
-      "str q15, [x23, #0x0]\n"
-      "str q16, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q8, [x25, #0x0]\n"
+      "str q15, [x24, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "145:"  // Height 5: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 118b\n"
@@ -2801,16 +2800,16 @@
       "149:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 150f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 151f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2822,11 +2821,11 @@
       "b 151f\n"
       "150:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "151:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "blt 154f\n"
@@ -2893,42 +2892,42 @@
       "ldr q2, [x25, #0x0]\n"
       "prfm pldl1keep, [x21, #0x80]\n"
       ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
+      "ldr q0, [x9, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      "ldr q6, [x9, #0xa0]\n"
+      ".inst 0x4e80a42c  // smmla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bc  // smmla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xb0]\n"
+      ".inst 0x4e86a429  // smmla v9.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a471  // smmla v17.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4b9  // smmla v25.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x9, #0xc0]\n"
+      ".inst 0x4e80a42d  // smmla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bd  // smmla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xd0]\n"
+      ".inst 0x4e86a42a  // smmla v10.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a472  // smmla v18.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4ba  // smmla v26.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x9, #0xe0]\n"
+      ".inst 0x4e80a42e  // smmla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4be  // smmla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x4e86a42b  // smmla v11.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a473  // smmla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4bb  // smmla v27.4s, v5.16b, v6.16b\n"
       "ldr q7, [x9, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e80a42f  // smmla v15.4s, v1.16b, v0.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x4e80a4bf  // smmla v31.4s, v5.16b, v0.16b\n"
       "ldr q5, [x22, #0x0]\n"
       "ldr q6, [x21, #0x0]\n"
       "bge 152b\n"
@@ -2984,87 +2983,87 @@
       ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
       ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
       ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x90]\n"
+      "ldr q0, [x9, #0x90]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x9, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x9, #0xf0]\n"
+      "ldr q2, [x9, #0xa0]\n"
+      ".inst 0x4e80a42c  // smmla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bc  // smmla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xb0]\n"
+      ".inst 0x4e82a429  // smmla v9.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a471  // smmla v17.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4b9  // smmla v25.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x9, #0xc0]\n"
+      ".inst 0x4e80a42d  // smmla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bd  // smmla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xd0]\n"
+      ".inst 0x4e82a42a  // smmla v10.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a472  // smmla v18.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4ba  // smmla v26.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x9, #0xe0]\n"
+      ".inst 0x4e80a42e  // smmla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4be  // smmla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x9, #0xf0]\n"
       "add x9, x9, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x4e82a42b  // smmla v11.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a473  // smmla v19.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4bb  // smmla v27.4s, v5.16b, v2.16b\n"
+      ".inst 0x4e80a42f  // smmla v15.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bf  // smmla v31.4s, v5.16b, v0.16b\n"
       "154:"  // Height 6: Multiply loop: Main loop skip
       "cbz x27, 161f\n"
       "cmp x27, #0x8\n"
       "blt 156f\n"
       "155:"  // Height 6: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "cmp x27, #0x8\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      "ldr q6, [x9, #0x0]\n"
-      "ldr q7, [x9, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a498  // smmla v24.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49c  // smmla v28.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a499  // smmla v25.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49d  // smmla v29.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49a  // smmla v26.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49e  // smmla v30.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x70]\n"
+      "ldr d1, [x22], #0x8\n"
+      "ldr d0, [x21], #0x8\n"
+      "trn1 v2.2d, v1.2d, v0.2d\n"
+      "ldr q1, [x9, #0x0]\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x4e81a488  // smmla v8.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a470  // smmla v16.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a458  // smmla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x20]\n"
+      ".inst 0x4e80a48c  // smmla v12.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45c  // smmla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x30]\n"
+      ".inst 0x4e81a489  // smmla v9.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a471  // smmla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x40]\n"
+      ".inst 0x4e80a48d  // smmla v13.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45d  // smmla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x50]\n"
+      ".inst 0x4e81a48a  // smmla v10.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a472  // smmla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45a  // smmla v26.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x9, #0x60]\n"
+      ".inst 0x4e80a48e  // smmla v14.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45e  // smmla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x9, #0x70]\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49b  // smmla v27.4s, v4.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49f  // smmla v31.4s, v4.16b, v7.16b\n"
+      ".inst 0x4e81a48b  // smmla v11.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a473  // smmla v19.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45b  // smmla v27.4s, v2.16b, v1.16b\n"
+      ".inst 0x4e80a48f  // smmla v15.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45f  // smmla v31.4s, v2.16b, v0.16b\n"
       "bge 155b\n"
       "156:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 161f\n"
@@ -3124,77 +3123,77 @@
       "ldr b6, [x21, #0x0]\n"
       "160:"  // Height 6: Multiply loop: Ragged operand read: Done
       "ldr q7, [x9, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x9, #0x10]\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a498  // smmla v24.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49c  // smmla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a499  // smmla v25.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49d  // smmla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49a  // smmla v26.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x9, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49e  // smmla v30.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x9, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      "trn1 v2.2d, v1.2d, v2.2d\n"
+      "trn1 v4.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e87a448  // smmla v8.4s, v2.16b, v7.16b\n"
+      "trn1 v3.2d, v5.2d, v6.2d\n"
+      "ldr q0, [x9, #0x10]\n"
+      ".inst 0x4e87a490  // smmla v16.4s, v4.16b, v7.16b\n"
+      ".inst 0x4e87a478  // smmla v24.4s, v3.16b, v7.16b\n"
+      "ldr q1, [x9, #0x20]\n"
+      ".inst 0x4e80a44c  // smmla v12.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e80a494  // smmla v20.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a47c  // smmla v28.4s, v3.16b, v0.16b\n"
+      "ldr q0, [x9, #0x30]\n"
+      ".inst 0x4e81a449  // smmla v9.4s, v2.16b, v1.16b\n"
+      ".inst 0x4e81a491  // smmla v17.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a479  // smmla v25.4s, v3.16b, v1.16b\n"
+      "ldr q1, [x9, #0x40]\n"
+      ".inst 0x4e80a44d  // smmla v13.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e80a495  // smmla v21.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a47d  // smmla v29.4s, v3.16b, v0.16b\n"
+      "ldr q0, [x9, #0x50]\n"
+      ".inst 0x4e81a44a  // smmla v10.4s, v2.16b, v1.16b\n"
+      ".inst 0x4e81a492  // smmla v18.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a47a  // smmla v26.4s, v3.16b, v1.16b\n"
+      "ldr q1, [x9, #0x60]\n"
+      ".inst 0x4e80a44e  // smmla v14.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e80a496  // smmla v22.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a47e  // smmla v30.4s, v3.16b, v0.16b\n"
+      "ldr q0, [x9, #0x70]\n"
+      ".inst 0x4e81a44b  // smmla v11.4s, v2.16b, v1.16b\n"
       "add x9, x9, #0x80\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49b  // smmla v27.4s, v4.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
+      ".inst 0x4e81a493  // smmla v19.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a47b  // smmla v27.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e80a44f  // smmla v15.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e80a497  // smmla v23.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a47f  // smmla v31.4s, v3.16b, v0.16b\n"
       "161:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 149b\n"
-      "ldr q0, [x14, #0x0]\n"
-      "ldr q1, [x14, #0x10]\n"
-      "uzp1 v7.2d, v8.2d, v12.2d\n"
+      "ldr q4, [x14, #0x0]\n"
+      "ldr q3, [x14, #0x10]\n"
+      "uzp1 v2.2d, v8.2d, v12.2d\n"
       "uzp2 v8.2d, v8.2d, v12.2d\n"
-      "ldr q2, [x14, #0x20]\n"
-      "ldr q3, [x14, #0x30]\n"
+      "ldr q1, [x14, #0x20]\n"
+      "ldr q0, [x14, #0x30]\n"
       "uzp1 v12.2d, v9.2d, v13.2d\n"
       "uzp2 v9.2d, v9.2d, v13.2d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x11, x20\n"
+      "add x24, x25, x20\n"
       "uzp1 v13.2d, v10.2d, v14.2d\n"
       "uzp2 v10.2d, v10.2d, v14.2d\n"
       "uzp1 v14.2d, v11.2d, v15.2d\n"
+      "add x23, x24, x20\n"
       "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
       "uzp2 v11.2d, v11.2d, v15.2d\n"
       "uzp1 v15.2d, v16.2d, v20.2d\n"
-      "add x20, x21, x20\n"
+      "add x21, x22, x20\n"
       "prfm pstl1keep, [x11, #0x0]\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
       "prfm pstl1keep, [x24, #0x0]\n"
-      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
       "add x14, x14, #0x40\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "uzp1 v23.2d, v24.2d, v28.2d\n"
@@ -3205,31 +3204,31 @@
       "uzp2 v26.2d, v26.2d, v30.2d\n"
       "uzp1 v30.2d, v27.2d, v31.2d\n"
       "uzp2 v27.2d, v27.2d, v31.2d\n"
-      "mov v31.16b, v7.16b\n"
-      "add v31.4s, v31.4s, v0.4s\n"
-      "add v12.4s, v12.4s, v1.4s\n"
-      "add v13.4s, v13.4s, v2.4s\n"
-      "add v14.4s, v14.4s, v3.4s\n"
-      "add v8.4s, v8.4s, v0.4s\n"
-      "add v9.4s, v9.4s, v1.4s\n"
-      "add v10.4s, v10.4s, v2.4s\n"
-      "add v11.4s, v11.4s, v3.4s\n"
-      "add v15.4s, v15.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v28.4s, v28.4s, v1.4s\n"
-      "add v29.4s, v29.4s, v2.4s\n"
-      "add v30.4s, v30.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
+      "mov v31.16b, v2.16b\n"
+      "add v31.4s, v31.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v3.4s\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v14.4s, v14.4s, v0.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v3.4s\n"
+      "add v10.4s, v10.4s, v1.4s\n"
+      "add v11.4s, v11.4s, v0.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v3.4s\n"
+      "add v18.4s, v18.4s, v1.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v1.4s\n"
+      "add v30.4s, v30.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v3.4s\n"
+      "add v26.4s, v26.4s, v1.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
       "tbz %x[flags], #4, 162f\n"
       "ldr q0, [x12, #0x0]\n"
       "ldr q4, [x13, #0x0]\n"
@@ -3243,10 +3242,10 @@
       "add x13, x13, #0x40\n"
       "b 163f\n"
       "162:"  // Height 6: per layer parameters
-      "add x25, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x25]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "mov v1.16b, v0.16b\n"
       "mov v5.16b, v4.16b\n"
       "mov v2.16b, v0.16b\n"
@@ -3279,91 +3278,91 @@
       "sqrdmulh v26.4s, v26.4s, v6.4s\n"
       "sqrdmulh v27.4s, v27.4s, v7.4s\n"
       "tbz %x[flags], #5, 164f\n"
-      "and v4.16b, v31.16b, v0.16b\n"
-      "and v5.16b, v12.16b, v1.16b\n"
-      "and v6.16b, v13.16b, v2.16b\n"
-      "and v7.16b, v14.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v7.16b, v31.16b, v0.16b\n"
+      "and v6.16b, v12.16b, v1.16b\n"
+      "and v5.16b, v13.16b, v2.16b\n"
+      "and v4.16b, v14.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v31.4s, v31.4s, v4.4s\n"
-      "sqadd v12.4s, v12.4s, v5.4s\n"
-      "sqadd v13.4s, v13.4s, v6.4s\n"
-      "sqadd v14.4s, v14.4s, v7.4s\n"
-      "and v4.16b, v8.16b, v0.16b\n"
-      "and v5.16b, v9.16b, v1.16b\n"
-      "and v6.16b, v10.16b, v2.16b\n"
-      "and v7.16b, v11.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v8.4s, v8.4s, v4.4s\n"
-      "sqadd v9.4s, v9.4s, v5.4s\n"
-      "sqadd v10.4s, v10.4s, v6.4s\n"
-      "sqadd v11.4s, v11.4s, v7.4s\n"
-      "and v4.16b, v15.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v1.16b\n"
-      "and v6.16b, v21.16b, v2.16b\n"
-      "and v7.16b, v22.16b, v3.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v15.4s, v15.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v1.16b\n"
-      "and v6.16b, v18.16b, v2.16b\n"
-      "and v7.16b, v19.16b, v3.16b\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v31.4s, v31.4s, v7.4s\n"
+      "sqadd v12.4s, v12.4s, v6.4s\n"
+      "sqadd v13.4s, v13.4s, v5.4s\n"
+      "sqadd v14.4s, v14.4s, v4.4s\n"
+      "and v7.16b, v8.16b, v0.16b\n"
+      "and v6.16b, v9.16b, v1.16b\n"
+      "and v5.16b, v10.16b, v2.16b\n"
+      "and v4.16b, v11.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v28.16b, v1.16b\n"
-      "and v6.16b, v29.16b, v2.16b\n"
-      "and v7.16b, v30.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v7.4s\n"
+      "sqadd v9.4s, v9.4s, v6.4s\n"
+      "sqadd v10.4s, v10.4s, v5.4s\n"
+      "sqadd v11.4s, v11.4s, v4.4s\n"
+      "and v7.16b, v15.16b, v0.16b\n"
+      "and v6.16b, v20.16b, v1.16b\n"
+      "and v5.16b, v21.16b, v2.16b\n"
+      "and v4.16b, v22.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v28.4s, v28.4s, v5.4s\n"
-      "sqadd v29.4s, v29.4s, v6.4s\n"
-      "sqadd v30.4s, v30.4s, v7.4s\n"
-      "and v4.16b, v24.16b, v0.16b\n"
-      "and v5.16b, v25.16b, v1.16b\n"
-      "and v6.16b, v26.16b, v2.16b\n"
-      "and v7.16b, v27.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
       "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v6.4s\n"
+      "sqadd v21.4s, v21.4s, v5.4s\n"
+      "sqadd v22.4s, v22.4s, v4.4s\n"
+      "and v7.16b, v16.16b, v0.16b\n"
+      "and v6.16b, v17.16b, v1.16b\n"
+      "and v5.16b, v18.16b, v2.16b\n"
+      "and v4.16b, v19.16b, v3.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v24.4s, v24.4s, v4.4s\n"
-      "sqadd v25.4s, v25.4s, v5.4s\n"
-      "sqadd v26.4s, v26.4s, v6.4s\n"
-      "sqadd v27.4s, v27.4s, v7.4s\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v7.4s\n"
+      "sqadd v17.4s, v17.4s, v6.4s\n"
+      "sqadd v18.4s, v18.4s, v5.4s\n"
+      "sqadd v19.4s, v19.4s, v4.4s\n"
+      "and v7.16b, v23.16b, v0.16b\n"
+      "and v6.16b, v28.16b, v1.16b\n"
+      "and v5.16b, v29.16b, v2.16b\n"
+      "and v4.16b, v30.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v7.4s\n"
+      "sqadd v28.4s, v28.4s, v6.4s\n"
+      "sqadd v29.4s, v29.4s, v5.4s\n"
+      "sqadd v30.4s, v30.4s, v4.4s\n"
+      "and v7.16b, v24.16b, v0.16b\n"
+      "and v6.16b, v25.16b, v1.16b\n"
+      "and v5.16b, v26.16b, v2.16b\n"
+      "and v4.16b, v27.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v7.4s\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "sqadd v26.4s, v26.4s, v5.4s\n"
+      "sqadd v27.4s, v27.4s, v4.4s\n"
       "164:"  // Height 6: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v6.4s }, [x20]\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
       "srshl v12.4s, v12.4s, v1.4s\n"
       "srshl v13.4s, v13.4s, v2.4s\n"
       "srshl v14.4s, v14.4s, v3.4s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v5.4s }, [x20]\n"
       "srshl v8.4s, v8.4s, v0.4s\n"
       "srshl v9.4s, v9.4s, v1.4s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x25]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v4.4s }, [x20]\n"
       "srshl v10.4s, v10.4s, v2.4s\n"
       "srshl v11.4s, v11.4s, v3.4s\n"
       "cmp x10, #0x10\n"
@@ -3383,225 +3382,225 @@
       "srshl v25.4s, v25.4s, v1.4s\n"
       "srshl v26.4s, v26.4s, v2.4s\n"
       "srshl v27.4s, v27.4s, v3.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add v12.4s, v12.4s, v4.4s\n"
-      "add v13.4s, v13.4s, v4.4s\n"
-      "add v14.4s, v14.4s, v4.4s\n"
-      "add v8.4s, v8.4s, v4.4s\n"
-      "add v9.4s, v9.4s, v4.4s\n"
-      "add v10.4s, v10.4s, v4.4s\n"
-      "add v11.4s, v11.4s, v4.4s\n"
-      "add v15.4s, v15.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smin v12.4s, v12.4s, v6.4s\n"
-      "smin v13.4s, v13.4s, v6.4s\n"
-      "smin v14.4s, v14.4s, v6.4s\n"
-      "smin v8.4s, v8.4s, v6.4s\n"
-      "smin v9.4s, v9.4s, v6.4s\n"
-      "smin v10.4s, v10.4s, v6.4s\n"
-      "smin v11.4s, v11.4s, v6.4s\n"
-      "smin v15.4s, v15.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
-      "smax v12.4s, v12.4s, v5.4s\n"
-      "smax v13.4s, v13.4s, v5.4s\n"
-      "smax v14.4s, v14.4s, v5.4s\n"
-      "smax v8.4s, v8.4s, v5.4s\n"
-      "smax v9.4s, v9.4s, v5.4s\n"
-      "smax v10.4s, v10.4s, v5.4s\n"
-      "smax v11.4s, v11.4s, v5.4s\n"
-      "smax v15.4s, v15.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v31.4s, v31.4s, v6.4s\n"
+      "add v12.4s, v12.4s, v6.4s\n"
+      "add v13.4s, v13.4s, v6.4s\n"
+      "add v14.4s, v14.4s, v6.4s\n"
+      "add v8.4s, v8.4s, v6.4s\n"
+      "add v9.4s, v9.4s, v6.4s\n"
+      "add v10.4s, v10.4s, v6.4s\n"
+      "add v11.4s, v11.4s, v6.4s\n"
+      "add v15.4s, v15.4s, v6.4s\n"
+      "add v20.4s, v20.4s, v6.4s\n"
+      "add v21.4s, v21.4s, v6.4s\n"
+      "add v22.4s, v22.4s, v6.4s\n"
+      "add v16.4s, v16.4s, v6.4s\n"
+      "add v17.4s, v17.4s, v6.4s\n"
+      "add v18.4s, v18.4s, v6.4s\n"
+      "add v19.4s, v19.4s, v6.4s\n"
+      "add v23.4s, v23.4s, v6.4s\n"
+      "add v28.4s, v28.4s, v6.4s\n"
+      "add v29.4s, v29.4s, v6.4s\n"
+      "add v30.4s, v30.4s, v6.4s\n"
+      "add v24.4s, v24.4s, v6.4s\n"
+      "add v25.4s, v25.4s, v6.4s\n"
+      "add v26.4s, v26.4s, v6.4s\n"
+      "add v27.4s, v27.4s, v6.4s\n"
+      "smin v31.4s, v31.4s, v5.4s\n"
+      "smin v12.4s, v12.4s, v5.4s\n"
+      "smin v13.4s, v13.4s, v5.4s\n"
+      "smin v14.4s, v14.4s, v5.4s\n"
+      "smin v8.4s, v8.4s, v5.4s\n"
+      "smin v9.4s, v9.4s, v5.4s\n"
+      "smin v10.4s, v10.4s, v5.4s\n"
+      "smin v11.4s, v11.4s, v5.4s\n"
+      "smin v15.4s, v15.4s, v5.4s\n"
+      "smin v20.4s, v20.4s, v5.4s\n"
+      "smin v21.4s, v21.4s, v5.4s\n"
+      "smin v22.4s, v22.4s, v5.4s\n"
+      "smin v16.4s, v16.4s, v5.4s\n"
+      "smin v17.4s, v17.4s, v5.4s\n"
+      "smin v18.4s, v18.4s, v5.4s\n"
+      "smin v19.4s, v19.4s, v5.4s\n"
+      "smin v23.4s, v23.4s, v5.4s\n"
+      "smin v28.4s, v28.4s, v5.4s\n"
+      "smin v29.4s, v29.4s, v5.4s\n"
+      "smin v30.4s, v30.4s, v5.4s\n"
+      "smin v24.4s, v24.4s, v5.4s\n"
+      "smin v25.4s, v25.4s, v5.4s\n"
+      "smin v26.4s, v26.4s, v5.4s\n"
+      "smin v27.4s, v27.4s, v5.4s\n"
+      "smax v31.4s, v31.4s, v4.4s\n"
+      "smax v12.4s, v12.4s, v4.4s\n"
+      "smax v13.4s, v13.4s, v4.4s\n"
+      "smax v14.4s, v14.4s, v4.4s\n"
+      "smax v8.4s, v8.4s, v4.4s\n"
+      "smax v9.4s, v9.4s, v4.4s\n"
+      "smax v10.4s, v10.4s, v4.4s\n"
+      "smax v11.4s, v11.4s, v4.4s\n"
+      "smax v15.4s, v15.4s, v4.4s\n"
+      "smax v20.4s, v20.4s, v4.4s\n"
+      "smax v21.4s, v21.4s, v4.4s\n"
+      "smax v22.4s, v22.4s, v4.4s\n"
+      "smax v16.4s, v16.4s, v4.4s\n"
+      "smax v17.4s, v17.4s, v4.4s\n"
+      "smax v18.4s, v18.4s, v4.4s\n"
+      "smax v19.4s, v19.4s, v4.4s\n"
+      "smax v23.4s, v23.4s, v4.4s\n"
+      "smax v28.4s, v28.4s, v4.4s\n"
+      "smax v29.4s, v29.4s, v4.4s\n"
+      "smax v30.4s, v30.4s, v4.4s\n"
+      "smax v24.4s, v24.4s, v4.4s\n"
+      "smax v25.4s, v25.4s, v4.4s\n"
+      "smax v26.4s, v26.4s, v4.4s\n"
+      "smax v27.4s, v27.4s, v4.4s\n"
       "uzp1 v31.8h, v31.8h, v12.8h\n"
-      "uzp1 v12.8h, v13.8h, v14.8h\n"
+      "uzp1 v1.8h, v13.8h, v14.8h\n"
       "uzp1 v8.8h, v8.8h, v9.8h\n"
-      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v0.8h, v10.8h, v11.8h\n"
       "uzp1 v15.8h, v15.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v23.8h, v23.8h, v28.8h\n"
-      "uzp1 v28.8h, v29.8h, v30.8h\n"
+      "uzp1 v18.8h, v29.8h, v30.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "uzp1 v31.16b, v31.16b, v12.16b\n"
-      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "uzp1 v31.16b, v31.16b, v1.16b\n"
+      "uzp1 v8.16b, v8.16b, v0.16b\n"
       "uzp1 v15.16b, v15.16b, v20.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v23.16b, v23.16b, v28.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v23.16b, v23.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 173f\n"
       "tbz x10, #3, 168f\n"
       "str d31, [x11], #0x8\n"
-      "str d8, [x24], #0x8\n"
-      "str d15, [x23], #0x8\n"
-      "str d16, [x22], #0x8\n"
-      "str d23, [x21], #0x8\n"
-      "str d24, [x20], #0x8\n"
+      "str d8, [x25], #0x8\n"
+      "str d15, [x24], #0x8\n"
+      "str d16, [x23], #0x8\n"
+      "str d23, [x22], #0x8\n"
+      "str d24, [x21], #0x8\n"
       "tbz x10, #2, 166f\n"
       "st1 { v31.s }[2], [x11], #0x4\n"
-      "st1 { v8.s }[2], [x24], #0x4\n"
-      "st1 { v15.s }[2], [x23], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
-      "st1 { v23.s }[2], [x21], #0x4\n"
-      "st1 { v24.s }[2], [x20], #0x4\n"
+      "st1 { v8.s }[2], [x25], #0x4\n"
+      "st1 { v15.s }[2], [x24], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v23.s }[2], [x22], #0x4\n"
+      "st1 { v24.s }[2], [x21], #0x4\n"
       "tbz x10, #1, 165f\n"
       "st1 { v31.h }[6], [x11], #0x2\n"
-      "st1 { v8.h }[6], [x24], #0x2\n"
-      "st1 { v15.h }[6], [x23], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
-      "st1 { v23.h }[6], [x21], #0x2\n"
-      "st1 { v24.h }[6], [x20], #0x2\n"
+      "st1 { v8.h }[6], [x25], #0x2\n"
+      "st1 { v15.h }[6], [x24], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v23.h }[6], [x22], #0x2\n"
+      "st1 { v24.h }[6], [x21], #0x2\n"
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[14], [x11]\n"
-      "st1 { v8.b }[14], [x24]\n"
-      "st1 { v15.b }[14], [x23]\n"
-      "st1 { v16.b }[14], [x22]\n"
-      "st1 { v23.b }[14], [x21]\n"
-      "st1 { v24.b }[14], [x20]\n"
+      "st1 { v8.b }[14], [x25]\n"
+      "st1 { v15.b }[14], [x24]\n"
+      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v23.b }[14], [x22]\n"
+      "st1 { v24.b }[14], [x21]\n"
       "b 172f\n"
       "165:"  // Height 6: Partial direct writeback: partial_1_12
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[12], [x11]\n"
-      "st1 { v8.b }[12], [x24]\n"
-      "st1 { v15.b }[12], [x23]\n"
-      "st1 { v16.b }[12], [x22]\n"
-      "st1 { v23.b }[12], [x21]\n"
-      "st1 { v24.b }[12], [x20]\n"
+      "st1 { v8.b }[12], [x25]\n"
+      "st1 { v15.b }[12], [x24]\n"
+      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v23.b }[12], [x22]\n"
+      "st1 { v24.b }[12], [x21]\n"
       "b 172f\n"
       "166:"  // Height 6: Partial direct writeback: partial_2_8
       "tbz x10, #1, 167f\n"
       "st1 { v31.h }[4], [x11], #0x2\n"
-      "st1 { v8.h }[4], [x24], #0x2\n"
-      "st1 { v15.h }[4], [x23], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
-      "st1 { v23.h }[4], [x21], #0x2\n"
-      "st1 { v24.h }[4], [x20], #0x2\n"
+      "st1 { v8.h }[4], [x25], #0x2\n"
+      "st1 { v15.h }[4], [x24], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v23.h }[4], [x22], #0x2\n"
+      "st1 { v24.h }[4], [x21], #0x2\n"
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[10], [x11]\n"
-      "st1 { v8.b }[10], [x24]\n"
-      "st1 { v15.b }[10], [x23]\n"
-      "st1 { v16.b }[10], [x22]\n"
-      "st1 { v23.b }[10], [x21]\n"
-      "st1 { v24.b }[10], [x20]\n"
+      "st1 { v8.b }[10], [x25]\n"
+      "st1 { v15.b }[10], [x24]\n"
+      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v23.b }[10], [x22]\n"
+      "st1 { v24.b }[10], [x21]\n"
       "b 172f\n"
       "167:"  // Height 6: Partial direct writeback: partial_1_8
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[8], [x11]\n"
-      "st1 { v8.b }[8], [x24]\n"
-      "st1 { v15.b }[8], [x23]\n"
-      "st1 { v16.b }[8], [x22]\n"
-      "st1 { v23.b }[8], [x21]\n"
-      "st1 { v24.b }[8], [x20]\n"
+      "st1 { v8.b }[8], [x25]\n"
+      "st1 { v15.b }[8], [x24]\n"
+      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v23.b }[8], [x22]\n"
+      "st1 { v24.b }[8], [x21]\n"
       "b 172f\n"
       "168:"  // Height 6: Partial direct writeback: partial_4_0
       "tbz x10, #2, 170f\n"
       "str s31, [x11], #0x4\n"
-      "str s8, [x24], #0x4\n"
-      "str s15, [x23], #0x4\n"
-      "str s16, [x22], #0x4\n"
-      "str s23, [x21], #0x4\n"
-      "str s24, [x20], #0x4\n"
+      "str s8, [x25], #0x4\n"
+      "str s15, [x24], #0x4\n"
+      "str s16, [x23], #0x4\n"
+      "str s23, [x22], #0x4\n"
+      "str s24, [x21], #0x4\n"
       "tbz x10, #1, 169f\n"
       "st1 { v31.h }[2], [x11], #0x2\n"
-      "st1 { v8.h }[2], [x24], #0x2\n"
-      "st1 { v15.h }[2], [x23], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
-      "st1 { v23.h }[2], [x21], #0x2\n"
-      "st1 { v24.h }[2], [x20], #0x2\n"
+      "st1 { v8.h }[2], [x25], #0x2\n"
+      "st1 { v15.h }[2], [x24], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v23.h }[2], [x22], #0x2\n"
+      "st1 { v24.h }[2], [x21], #0x2\n"
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[6], [x11]\n"
-      "st1 { v8.b }[6], [x24]\n"
-      "st1 { v15.b }[6], [x23]\n"
-      "st1 { v16.b }[6], [x22]\n"
-      "st1 { v23.b }[6], [x21]\n"
-      "st1 { v24.b }[6], [x20]\n"
+      "st1 { v8.b }[6], [x25]\n"
+      "st1 { v15.b }[6], [x24]\n"
+      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v23.b }[6], [x22]\n"
+      "st1 { v24.b }[6], [x21]\n"
       "b 172f\n"
       "169:"  // Height 6: Partial direct writeback: partial_1_4
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[4], [x11]\n"
-      "st1 { v8.b }[4], [x24]\n"
-      "st1 { v15.b }[4], [x23]\n"
-      "st1 { v16.b }[4], [x22]\n"
-      "st1 { v23.b }[4], [x21]\n"
-      "st1 { v24.b }[4], [x20]\n"
+      "st1 { v8.b }[4], [x25]\n"
+      "st1 { v15.b }[4], [x24]\n"
+      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v23.b }[4], [x22]\n"
+      "st1 { v24.b }[4], [x21]\n"
       "b 172f\n"
       "170:"  // Height 6: Partial direct writeback: partial_2_0
       "tbz x10, #1, 171f\n"
       "str h31, [x11], #0x2\n"
-      "str h8, [x24], #0x2\n"
-      "str h15, [x23], #0x2\n"
-      "str h16, [x22], #0x2\n"
-      "str h23, [x21], #0x2\n"
-      "str h24, [x20], #0x2\n"
+      "str h8, [x25], #0x2\n"
+      "str h15, [x24], #0x2\n"
+      "str h16, [x23], #0x2\n"
+      "str h23, [x22], #0x2\n"
+      "str h24, [x21], #0x2\n"
       "tbz x10, #0, 172f\n"
       "st1 { v31.b }[2], [x11]\n"
-      "st1 { v8.b }[2], [x24]\n"
-      "st1 { v15.b }[2], [x23]\n"
-      "st1 { v16.b }[2], [x22]\n"
-      "st1 { v23.b }[2], [x21]\n"
-      "st1 { v24.b }[2], [x20]\n"
+      "st1 { v8.b }[2], [x25]\n"
+      "st1 { v15.b }[2], [x24]\n"
+      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v23.b }[2], [x22]\n"
+      "st1 { v24.b }[2], [x21]\n"
       "b 172f\n"
       "171:"  // Height 6: Partial direct writeback: partial_1_0
       "str b31, [x11, #0x0]\n"
-      "str b8, [x24, #0x0]\n"
-      "str b15, [x23, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
-      "str b23, [x21, #0x0]\n"
-      "str b24, [x20, #0x0]\n"
+      "str b8, [x25, #0x0]\n"
+      "str b15, [x24, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
+      "str b23, [x22, #0x0]\n"
+      "str b24, [x21, #0x0]\n"
       "172:"  // Height 6: Partial direct writeback: Done
       "b 174f\n"
       "173:"  // Height 6: Full writeback
       "str q31, [x11, #0x0]\n"
       "add x11, x11, #0x10\n"
-      "str q8, [x24, #0x0]\n"
-      "str q15, [x23, #0x0]\n"
-      "str q16, [x22, #0x0]\n"
-      "str q23, [x21, #0x0]\n"
-      "str q24, [x20, #0x0]\n"
+      "str q8, [x25, #0x0]\n"
+      "str q15, [x24, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
+      "str q23, [x22, #0x0]\n"
+      "str q24, [x21, #0x0]\n"
       "174:"  // Height 6: Writeback done
       "subs x10, x10, #0x10\n"
       "bgt 147b\n"
@@ -3617,7 +3616,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "176:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index 48ce676..a02fbe8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -79,12 +79,12 @@
             switch (ci->get_cpu_model()) {
                 default:
                     return { 31.65 };
-                case CPUModel::A55r1:
-                    return { 9.217 };
                 case CPUModel::A510:
                     return { 15.87 };
                 case CPUModel::V1:
                     return { 54.50 };
+                case CPUModel::A55r1:
+                    return { 9.217 };
             }
         }
 
@@ -97,7 +97,7 @@
                 case CPUModel::A510:
                     return { 16.66, 3.92, 0.48 };
                 case CPUModel::V1:
-                    return { 55.40, 19.21, 0.93 };
+                    return { 42.62, 16.32, 0.83 };
             }
         }
 
@@ -121,5 +121,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
index 8046b2e..289d38c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
@@ -77,7 +77,6 @@
     ka.N = N;
     ka.B_ptr = B_ptr;
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 171f\n"
@@ -165,11 +164,11 @@
       "14:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 15f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
       "cbnz x15, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
@@ -186,129 +185,129 @@
       "blt 18f\n"
       "17:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr d17, [x16, #0x20]\n"
+      "ldr x20, [x16, #0x28]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x38]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      "ldr x12, [x16, #0x48]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x78]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xb8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xf8]\n"
-      "mov v7.d[1], x11\n"
+      "ldr d16, [x16, #0x30]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x38]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr d17, [x16, #0x40]\n"
+      "ldr x20, [x16, #0x48]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr d16, [x16, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr d17, [x16, #0x60]\n"
+      "ldr x20, [x16, #0x68]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr d16, [x16, #0x70]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x78]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr d17, [x16, #0x80]\n"
+      "ldr x20, [x16, #0x88]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr d16, [x16, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr d17, [x16, #0xa0]\n"
+      "ldr x20, [x16, #0xa8]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr d16, [x16, #0xb0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xb8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr d17, [x16, #0xc0]\n"
+      "ldr x20, [x16, #0xc8]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr d16, [x16, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr d17, [x16, #0xe0]\n"
+      "ldr x20, [x16, #0xe8]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr d16, [x16, #0xf0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xf8]\n"
+      "mov v16.d[1], x20\n"
       "add x13, x13, #0x10\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "ldr x20, [x16, #0x8]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
       "sub x14, x14, #0x10\n"
       "ldr d7, [x16, #0x10]\n"
       "cmp x14, #0x20\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x18]\n"
-      "mov v0.d[1], x10\n"
-      "mov v7.d[1], x11\n"
+      "ldr x21, [x13, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x16, #0x18]\n"
+      "mov v0.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "bge 17b\n"
       "18:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q17, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x16, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x16, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x16, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x16, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x16, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x16, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x16, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x16, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x16, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x16, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x16, #0xf0]\n"
       "add x13, x13, #0x10\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
       "19:"  // Height 1: Multiply loop: Main loop skip
       "cbz x14, 24f\n"
       "cmp x14, #0x4\n"
       "blt 21f\n"
       "20:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s18, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q16, [x16, #0x0]\n"
+      ".inst 0x4f92e208  // sdot v8.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x4f92e209  // sdot v9.4s, v16.16b, v18.4b[0]\n"
+      "ldr q17, [x16, #0x20]\n"
       "cmp x14, #0x4\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x4f92e22a  // sdot v10.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x4f92e20b  // sdot v11.4s, v16.16b, v18.4b[0]\n"
       "add x16, x16, #0x40\n"
       "bge 20b\n"
       "21:"  // Height 1: Multiply loop: Skip odd blocks
@@ -321,14 +320,14 @@
       "22:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
       "23:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x0]\n"
+      ".inst 0x4f80e208  // sdot v8.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x20]\n"
+      ".inst 0x4f80e20a  // sdot v10.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
       "24:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -499,226 +498,226 @@
       "48:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 49f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
       "cbnz x15, 50f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
-      "add x9, x9, x20\n"
+      "add x12, x12, x20\n"
       "b 50f\n"
       "49:"  // Height 2: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
+      "add x12, x13, x21\n"
       "50:"  // Height 2: input setup done
       "cmp x14, #0x10\n"
       "blt 53f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 52f\n"
       "51:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d17, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v6.d[1], x12\n"
+      "ldr d16, [x16, #0x30]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr d17, [x16, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr x20, [x16, #0x48]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr d16, [x16, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr d17, [x16, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr d16, [x16, #0x70]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr d17, [x16, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr x20, [x16, #0x88]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr d16, [x16, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr d17, [x16, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr d16, [x16, #0xb0]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr d17, [x16, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr x20, [x16, #0xc8]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr d16, [x16, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr d17, [x16, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr d16, [x16, #0xf0]\n"
+      "mov v17.d[1], x21\n"
       "add x13, x13, #0x10\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
+      "mov v16.d[1], x20\n"
+      "add x12, x12, #0x10\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0x8]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
       "sub x14, x14, #0x10\n"
       "ldr d7, [x16, #0x10]\n"
       "cmp x14, #0x20\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x28, [x9, #0x8]\n"
-      "mov v0.d[1], x10\n"
-      "ldr x11, [x16, #0x18]\n"
-      "mov v1.d[1], x28\n"
+      "ldr x20, [x13, #0x8]\n"
+      "mov v6.d[1], x21\n"
+      "ldr x21, [x12, #0x8]\n"
+      "mov v0.d[1], x20\n"
+      "ldr x20, [x16, #0x18]\n"
+      "mov v1.d[1], x21\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v7.d[1], x11\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "mov v7.d[1], x20\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "bge 51b\n"
       "52:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q17, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x16, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x16, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x16, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x16, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x16, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x16, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x16, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x16, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x16, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x16, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x16, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x16, #0xf0]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
       "53:"  // Height 2: Multiply loop: Main loop skip
       "cbz x14, 58f\n"
       "cmp x14, #0x4\n"
       "blt 55f\n"
       "54:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s19, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s18, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q17, [x16, #0x0]\n"
+      ".inst 0x4f93e228  // sdot v8.4s, v17.16b, v19.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x4f92e22c  // sdot v12.4s, v17.16b, v18.4b[0]\n"
+      "ldr q17, [x16, #0x20]\n"
+      ".inst 0x4f93e209  // sdot v9.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20d  // sdot v13.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x4f93e22a  // sdot v10.4s, v17.16b, v19.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f92e22e  // sdot v14.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x4f93e20b  // sdot v11.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20f  // sdot v15.4s, v16.16b, v18.4b[0]\n"
       "bge 54b\n"
       "55:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x14, 58f\n"
       "tbz x14, #1, 56f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
       "tbz x14, #0, 57f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x12]\n"
       "b 57f\n"
       "56:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
       "57:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q17, [x16, #0x0]\n"
+      ".inst 0x4f80e228  // sdot v8.4s, v17.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x4f81e22c  // sdot v12.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x16, #0x20]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20d  // sdot v13.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
       "58:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -936,281 +935,281 @@
       "82:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 83f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
       "cbnz x15, 84f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
-      "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
       "b 84f\n"
       "83:"  // Height 3: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
       "84:"  // Height 3: input setup done
       "cmp x14, #0x10\n"
       "blt 87f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 86f\n"
       "85:"  // Height 3: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d21, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v21.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr d20, [x16, #0x30]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      "ldr x20, [x16, #0x58]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr d21, [x16, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr d20, [x16, #0x50]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr d21, [x16, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0x88]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr d20, [x16, #0x70]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x98]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr d21, [x16, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr d20, [x16, #0x90]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr d21, [x16, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xc8]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr d20, [x16, #0xb0]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xd8]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr d21, [x16, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr d20, [x16, #0xd0]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr d21, [x16, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
       "add x13, x13, #0x10\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
-      "add x27, x27, #0x10\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr d20, [x16, #0xf0]\n"
+      "mov v20.d[1], x20\n"
+      "add x12, x12, #0x10\n"
+      "add x11, x11, #0x10\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x10, [x13, #0x8]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
+      "ldr x20, [x16, #0x8]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      "ldr x23, [x13, #0x8]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "ldr x22, [x12, #0x8]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
+      "ldr d2, [x11, #0x0]\n"
       "sub x14, x14, #0x10\n"
       "ldr d7, [x16, #0x10]\n"
       "cmp x14, #0x20\n"
-      "ldr x26, [x27, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x18]\n"
-      "mov v0.d[1], x10\n"
+      "ldr x21, [x11, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x16, #0x18]\n"
+      "mov v0.d[1], x23\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v1.d[1], x28\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "mov v2.d[1], x26\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "mov v7.d[1], x11\n"
+      "mov v1.d[1], x22\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "mov v2.d[1], x21\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "mov v7.d[1], x20\n"
       "bge 85b\n"
       "86:"  // Height 3: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q21, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q20, [x16, #0x30]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x16, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x16, #0x50]\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x16, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x16, #0x70]\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x16, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x16, #0x90]\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x16, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x16, #0xb0]\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x16, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x16, #0xd0]\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x16, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x16, #0xf0]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
       "87:"  // Height 3: Multiply loop: Main loop skip
       "cbz x14, 92f\n"
       "cmp x14, #0x4\n"
       "blt 89f\n"
       "88:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s24, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s23, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s22, [x11], #0x4\n"
+      "ldr q21, [x16, #0x0]\n"
+      ".inst 0x4f98e2a8  // sdot v8.4s, v21.16b, v24.4b[0]\n"
+      "ldr q20, [x16, #0x10]\n"
+      ".inst 0x4f97e2ac  // sdot v12.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x4f96e2b0  // sdot v16.4s, v21.16b, v22.4b[0]\n"
+      "ldr q21, [x16, #0x20]\n"
+      ".inst 0x4f98e289  // sdot v9.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28d  // sdot v13.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e291  // sdot v17.4s, v20.16b, v22.4b[0]\n"
+      "ldr q20, [x16, #0x30]\n"
+      ".inst 0x4f98e2aa  // sdot v10.4s, v21.16b, v24.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f97e2ae  // sdot v14.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x4f96e2b2  // sdot v18.4s, v21.16b, v22.4b[0]\n"
+      ".inst 0x4f98e28b  // sdot v11.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28f  // sdot v15.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e293  // sdot v19.4s, v20.16b, v22.4b[0]\n"
       "bge 88b\n"
       "89:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x14, 92f\n"
       "tbz x14, #1, 90f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
       "tbz x14, #0, 91f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
       "b 91f\n"
       "90:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
       "91:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q21, [x16, #0x0]\n"
+      ".inst 0x4f80e2a8  // sdot v8.4s, v21.16b, v0.4b[0]\n"
+      "ldr q20, [x16, #0x10]\n"
+      ".inst 0x4f81e2ac  // sdot v12.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b0  // sdot v16.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x16, #0x20]\n"
+      ".inst 0x4f80e289  // sdot v9.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28d  // sdot v13.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e291  // sdot v17.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x16, #0x30]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
       "92:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -1475,336 +1474,336 @@
       "116:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 117f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
       "cbnz x15, 118f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
-      "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "b 118f\n"
       "117:"  // Height 4: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
       "118:"  // Height 4: input setup done
       "cmp x14, #0x10\n"
       "blt 121f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 120f\n"
       "119:"  // Height 4: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d25, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v25.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "add x25, x25, #0x10\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr x10, [x13, #0x8]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr x26, [x27, #0x8]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr d24, [x16, #0x30]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      "ldr x20, [x16, #0x58]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      "add x11, x11, #0x10\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr d25, [x16, #0x40]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr d24, [x16, #0x50]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      "ldr x25, [x13, #0x8]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr d25, [x16, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0x88]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      "ldr x24, [x12, #0x8]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr d24, [x16, #0x70]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x98]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      "ldr x23, [x11, #0x8]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr d25, [x16, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      "ldr x22, [x10, #0x8]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr d24, [x16, #0x90]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr d25, [x16, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xc8]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
       "cmp x14, #0x20\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr d24, [x16, #0xb0]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xd8]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr d25, [x16, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr d24, [x16, #0xd0]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr d25, [x16, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr d24, [x16, #0xf0]\n"
+      "mov v24.d[1], x20\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0x18]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0x8]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0x18]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
+      "ldr d2, [x11, #0x0]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
+      "ldr d3, [x10, #0x0]\n"
       "ldr d7, [x16, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
-      "mov v7.d[1], x11\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x25\n"
+      "mov v1.d[1], x24\n"
+      "mov v2.d[1], x23\n"
+      "mov v3.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 119b\n"
       "120:"  // Height 4: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q25, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "ldr q24, [x16, #0x30]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x16, #0x40]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x16, #0x50]\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x16, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x16, #0x70]\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x16, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x16, #0x90]\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x16, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x16, #0xb0]\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x16, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x16, #0xd0]\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x16, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x16, #0xf0]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
       "121:"  // Height 4: Multiply loop: Main loop skip
       "cbz x14, 126f\n"
       "cmp x14, #0x4\n"
       "blt 123f\n"
       "122:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s29, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s28, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s27, [x11], #0x4\n"
+      "ldr s26, [x10], #0x4\n"
+      "ldr q25, [x16, #0x0]\n"
+      ".inst 0x4f9de328  // sdot v8.4s, v25.16b, v29.4b[0]\n"
+      "ldr q24, [x16, #0x10]\n"
+      ".inst 0x4f9ce32c  // sdot v12.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be330  // sdot v16.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae334  // sdot v20.4s, v25.16b, v26.4b[0]\n"
+      "ldr q25, [x16, #0x20]\n"
+      ".inst 0x4f9de309  // sdot v9.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30d  // sdot v13.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be311  // sdot v17.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae315  // sdot v21.4s, v24.16b, v26.4b[0]\n"
+      "ldr q24, [x16, #0x30]\n"
+      ".inst 0x4f9de32a  // sdot v10.4s, v25.16b, v29.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f9ce32e  // sdot v14.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be332  // sdot v18.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae336  // sdot v22.4s, v25.16b, v26.4b[0]\n"
+      ".inst 0x4f9de30b  // sdot v11.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30f  // sdot v15.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be313  // sdot v19.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae317  // sdot v23.4s, v24.16b, v26.4b[0]\n"
       "bge 122b\n"
       "123:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x14, 126f\n"
       "tbz x14, #1, 124f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h3, [x10], #0x2\n"
       "tbz x14, #0, 125f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
+      "ld1 { v3.b }[2], [x10]\n"
       "b 125f\n"
       "124:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
+      "ldr b3, [x10, #0x0]\n"
       "125:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q25, [x16, #0x0]\n"
+      ".inst 0x4f80e328  // sdot v8.4s, v25.16b, v0.4b[0]\n"
+      "ldr q24, [x16, #0x10]\n"
+      ".inst 0x4f81e32c  // sdot v12.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e330  // sdot v16.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e334  // sdot v20.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x16, #0x20]\n"
+      ".inst 0x4f80e309  // sdot v9.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30d  // sdot v13.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e311  // sdot v17.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e315  // sdot v21.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x16, #0x30]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
       "126:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -2116,391 +2115,391 @@
       "150:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 151f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
       "cbnz x15, 152f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
-      "add x23, x23, x20\n"
       "b 152f\n"
       "151:"  // Height 5: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
-      "add x23, x25, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
       "152:"  // Height 5: input setup done
       "cmp x14, #0x10\n"
       "blt 155f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 154f\n"
       "153:"  // Height 5: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d29, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v29.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "add x23, x23, #0x10\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr x10, [x13, #0x8]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr x26, [x27, #0x8]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr x22, [x23, #0x8]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr d28, [x16, #0x30]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      "ldr x20, [x16, #0x58]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      "ldr x26, [x13, #0x8]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr d29, [x16, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      "ldr x25, [x12, #0x8]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      "ldr x24, [x11, #0x8]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr d28, [x16, #0x50]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      "ldr x23, [x10, #0x8]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      "ldr x22, [x9, #0x8]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr d29, [x16, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0x88]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
       "cmp x14, #0x20\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr d28, [x16, #0x70]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x98]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr d29, [x16, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr d28, [x16, #0x90]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr d29, [x16, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xc8]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr d28, [x16, #0xb0]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xd8]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr d29, [x16, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr d28, [x16, #0xd0]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr d29, [x16, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr d28, [x16, #0xf0]\n"
+      "mov v28.d[1], x20\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0x18]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0x8]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0x18]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
+      "ldr d2, [x11, #0x0]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
+      "ldr d3, [x10, #0x0]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
+      "ldr d4, [x9, #0x0]\n"
       "ldr d7, [x16, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x26\n"
+      "mov v1.d[1], x25\n"
+      "mov v2.d[1], x24\n"
+      "mov v3.d[1], x23\n"
       "mov v4.d[1], x22\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "bge 153b\n"
       "154:"  // Height 5: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q29, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "ldr q28, [x16, #0x30]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x16, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x16, #0x50]\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x16, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x16, #0x70]\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x16, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x16, #0x90]\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x16, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x16, #0xb0]\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x16, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x16, #0xd0]\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x16, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x16, #0xf0]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
       "155:"  // Height 5: Multiply loop: Main loop skip
       "cbz x14, 160f\n"
       "cmp x14, #0x4\n"
       "blt 157f\n"
       "156:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s2, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s0, [x11], #0x4\n"
+      "ldr s31, [x10], #0x4\n"
+      "ldr s30, [x9], #0x4\n"
+      "ldr q29, [x16, #0x0]\n"
+      ".inst 0x4f82e3a8  // sdot v8.4s, v29.16b, v2.4b[0]\n"
+      "ldr q28, [x16, #0x10]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3b0  // sdot v16.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b4  // sdot v20.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3b8  // sdot v24.4s, v29.16b, v30.4b[0]\n"
+      "ldr q29, [x16, #0x20]\n"
+      ".inst 0x4f82e389  // sdot v9.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e391  // sdot v17.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe395  // sdot v21.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee399  // sdot v25.4s, v28.16b, v30.4b[0]\n"
+      "ldr q28, [x16, #0x30]\n"
+      ".inst 0x4f82e3aa  // sdot v10.4s, v29.16b, v2.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b6  // sdot v22.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3ba  // sdot v26.4s, v29.16b, v30.4b[0]\n"
+      ".inst 0x4f82e38b  // sdot v11.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe397  // sdot v23.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee39b  // sdot v27.4s, v28.16b, v30.4b[0]\n"
       "bge 156b\n"
       "157:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x14, 160f\n"
       "tbz x14, #1, 158f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h3, [x10], #0x2\n"
+      "ldr h4, [x9], #0x2\n"
       "tbz x14, #0, 159f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
-      "ld1 { v4.b }[2], [x23]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
+      "ld1 { v3.b }[2], [x10]\n"
+      "ld1 { v4.b }[2], [x9]\n"
       "b 159f\n"
       "158:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
-      "ldr b4, [x23, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
+      "ldr b3, [x10, #0x0]\n"
+      "ldr b4, [x9, #0x0]\n"
       "159:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q29, [x16, #0x0]\n"
+      ".inst 0x4f80e3a8  // sdot v8.4s, v29.16b, v0.4b[0]\n"
+      "ldr q28, [x16, #0x10]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b0  // sdot v16.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b4  // sdot v20.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3b8  // sdot v24.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x16, #0x20]\n"
+      ".inst 0x4f80e389  // sdot v9.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e391  // sdot v17.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e395  // sdot v21.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e399  // sdot v25.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x16, #0x30]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
       "160:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -2862,98 +2861,98 @@
       "184:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 185f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
+      "ldr x28, [x20, #0x28]\n"
       "cbnz x15, 186f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
-      "add x23, x23, x20\n"
-      "add x21, x21, x20\n"
+      "add x28, x28, x20\n"
       "b 186f\n"
       "185:"  // Height 6: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
-      "add x23, x25, x20\n"
-      "add x21, x23, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
+      "add x28, x9, x21\n"
       "186:"  // Height 6: input setup done
       "cmp x14, #0x10\n"
       "blt 189f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
-      "ldr q5, [x21, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
+      "ldr q5, [x28, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 188f\n"
       "187:"  // Height 6: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
       "ldr d6, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
       "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
+      "ldr x20, [x16, #0x58]\n"
       ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr x10, [x13, #0x8]\n"
+      "ldr x27, [x13, #0x8]\n"
       ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x26, [x12, #0x8]\n"
       ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr x25, [x11, #0x8]\n"
       ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
       "ldr d6, [x16, #0x40]\n"
       ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
+      "ldr x21, [x16, #0x68]\n"
       ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr x24, [x25, #0x8]\n"
+      "ldr x24, [x10, #0x8]\n"
       ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr x22, [x23, #0x8]\n"
+      "ldr x23, [x9, #0x8]\n"
       ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr x20, [x21, #0x8]\n"
+      "ldr x22, [x28, #0x8]\n"
       ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
       "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
       ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
+      "ldr x20, [x16, #0x78]\n"
       ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
@@ -2963,96 +2962,96 @@
       ".inst 0x4fa5e0dc  // sdot v28.4s, v6.16b, v5.4b[1]\n"
       "ldr d6, [x16, #0x60]\n"
       ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
+      "ldr x21, [x16, #0x88]\n"
       ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       ".inst 0x4fa5e0fd  // sdot v29.4s, v7.16b, v5.4b[1]\n"
       "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
       ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
+      "ldr x20, [x16, #0x98]\n"
       ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
       ".inst 0x4fa5e0de  // sdot v30.4s, v6.16b, v5.4b[1]\n"
       "ldr d6, [x16, #0x80]\n"
       ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
+      "ldr x21, [x16, #0xa8]\n"
       ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
       ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
       ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
       ".inst 0x4fa5e0ff  // sdot v31.4s, v7.16b, v5.4b[1]\n"
       "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
       ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
+      "ldr x20, [x16, #0xb8]\n"
       ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
       ".inst 0x4f85e8dc  // sdot v28.4s, v6.16b, v5.4b[2]\n"
       "ldr d6, [x16, #0xa0]\n"
       ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
+      "ldr x21, [x16, #0xc8]\n"
       ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
       ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
       ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
       ".inst 0x4f85e8fd  // sdot v29.4s, v7.16b, v5.4b[2]\n"
       "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
       ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
+      "ldr x20, [x16, #0xd8]\n"
       ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
       ".inst 0x4f85e8de  // sdot v30.4s, v6.16b, v5.4b[2]\n"
       "ldr d6, [x16, #0xc0]\n"
       ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
+      "ldr x21, [x16, #0xe8]\n"
       ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
       ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
       ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
       ".inst 0x4f85e8ff  // sdot v31.4s, v7.16b, v5.4b[2]\n"
       "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
       ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
+      "ldr x20, [x16, #0xf8]\n"
       ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
       ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
       ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
       ".inst 0x4fa5e8dc  // sdot v28.4s, v6.16b, v5.4b[3]\n"
       "ldr d6, [x16, #0xe0]\n"
       ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
       ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
       ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
       ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
       ".inst 0x4fa5e8fd  // sdot v29.4s, v7.16b, v5.4b[3]\n"
       "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "add x16, x16, #0x100\n"
       ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
+      "ldr x21, [x16, #0x8]\n"
       ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0x18]\n"
+      "ldr x20, [x16, #0x18]\n"
       ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
       ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
       ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
@@ -3061,56 +3060,56 @@
       ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
       ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x12, #0x0]\n"
       ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x11, #0x0]\n"
       ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "ldr d3, [x10, #0x0]\n"
       ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "ldr d4, [x9, #0x0]\n"
       ".inst 0x4fa5e8ff  // sdot v31.4s, v7.16b, v5.4b[3]\n"
-      "ldr d5, [x21, #0x0]\n"
+      "ldr d5, [x28, #0x0]\n"
       "ldr d7, [x16, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x27\n"
+      "mov v1.d[1], x26\n"
+      "mov v2.d[1], x25\n"
       "mov v3.d[1], x24\n"
-      "mov v4.d[1], x22\n"
-      "mov v5.d[1], x20\n"
-      "mov v7.d[1], x11\n"
+      "mov v4.d[1], x23\n"
+      "mov v5.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 187b\n"
       "188:"  // Height 6: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
       "ldr q6, [x16, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
       "ldr q7, [x16, #0x30]\n"
       ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
       ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
@@ -3210,98 +3209,98 @@
       "cmp x14, #0x4\n"
       "blt 191f\n"
       "190:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s7, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s6, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s5, [x11], #0x4\n"
+      "ldr s4, [x10], #0x4\n"
+      "ldr s3, [x9], #0x4\n"
+      "ldr s2, [x28], #0x4\n"
+      "ldr q1, [x16, #0x0]\n"
+      ".inst 0x4f87e028  // sdot v8.4s, v1.16b, v7.4b[0]\n"
+      "ldr q0, [x16, #0x10]\n"
+      ".inst 0x4f86e02c  // sdot v12.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e030  // sdot v16.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e034  // sdot v20.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e038  // sdot v24.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03c  // sdot v28.4s, v1.16b, v2.4b[0]\n"
+      "ldr q1, [x16, #0x20]\n"
+      ".inst 0x4f87e009  // sdot v9.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00d  // sdot v13.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e011  // sdot v17.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e015  // sdot v21.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e019  // sdot v25.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01d  // sdot v29.4s, v0.16b, v2.4b[0]\n"
+      "ldr q0, [x16, #0x30]\n"
+      ".inst 0x4f87e02a  // sdot v10.4s, v1.16b, v7.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f86e02e  // sdot v14.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e032  // sdot v18.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e036  // sdot v22.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e03a  // sdot v26.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03e  // sdot v30.4s, v1.16b, v2.4b[0]\n"
+      ".inst 0x4f87e00b  // sdot v11.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00f  // sdot v15.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e013  // sdot v19.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e017  // sdot v23.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e01b  // sdot v27.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01f  // sdot v31.4s, v0.16b, v2.4b[0]\n"
       "bge 190b\n"
       "191:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x14, 194f\n"
       "tbz x14, #1, 192f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
-      "ldr h5, [x21], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h3, [x10], #0x2\n"
+      "ldr h4, [x9], #0x2\n"
+      "ldr h5, [x28], #0x2\n"
       "tbz x14, #0, 193f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
-      "ld1 { v4.b }[2], [x23]\n"
-      "ld1 { v5.b }[2], [x21]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
+      "ld1 { v3.b }[2], [x10]\n"
+      "ld1 { v4.b }[2], [x9]\n"
+      "ld1 { v5.b }[2], [x28]\n"
       "b 193f\n"
       "192:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
-      "ldr b4, [x23, #0x0]\n"
-      "ldr b5, [x21, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
+      "ldr b3, [x10, #0x0]\n"
+      "ldr b4, [x9, #0x0]\n"
+      "ldr b5, [x28, #0x0]\n"
       "193:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x16, #0x0]\n"
+      ".inst 0x4f80e0e8  // sdot v8.4s, v7.16b, v0.4b[0]\n"
+      "ldr q6, [x16, #0x10]\n"
+      ".inst 0x4f81e0ec  // sdot v12.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f0  // sdot v16.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f4  // sdot v20.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f8  // sdot v24.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fc  // sdot v28.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x16, #0x20]\n"
+      ".inst 0x4f80e0c9  // sdot v9.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cd  // sdot v13.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d1  // sdot v17.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d5  // sdot v21.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d9  // sdot v25.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dd  // sdot v29.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x16, #0x30]\n"
+      ".inst 0x4f80e0ea  // sdot v10.4s, v7.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f81e0ee  // sdot v14.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f2  // sdot v18.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f6  // sdot v22.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fa  // sdot v26.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fe  // sdot v30.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0cb  // sdot v11.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cf  // sdot v15.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d3  // sdot v19.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d7  // sdot v23.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0db  // sdot v27.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0df  // sdot v31.4s, v6.16b, v5.4b[0]\n"
       "194:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -3488,7 +3487,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "206:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
index ddf7761..452d647 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -77,7 +77,6 @@
     ka.N = N;
     ka.B_ptr = B_ptr;
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 171f\n"
@@ -165,11 +164,11 @@
       "14:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 15f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -186,37 +185,37 @@
       "blt 18f\n"
       "17:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
       "cmp x27, #0x20\n"
       "add x10, x10, #0x100\n"
@@ -226,37 +225,37 @@
       "bge 17b\n"
       "18:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "19:"  // Height 1: Multiply loop: Main loop skip
@@ -264,17 +263,17 @@
       "cmp x27, #0x4\n"
       "blt 21f\n"
       "20:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr q16, [x10, #0x0]\n"
+      ".inst 0x4f92e208  // sdot v8.4s, v16.16b, v18.4b[0]\n"
       "sub x27, x27, #0x4\n"
-      "ldr q7, [x10, #0x10]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4f92e209  // sdot v9.4s, v16.16b, v18.4b[0]\n"
       "cmp x27, #0x4\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f92e22a  // sdot v10.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x4f92e20b  // sdot v11.4s, v16.16b, v18.4b[0]\n"
       "add x10, x10, #0x40\n"
       "bge 20b\n"
       "21:"  // Height 1: Multiply loop: Skip odd blocks
@@ -287,14 +286,14 @@
       "22:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x26, #0x0]\n"
       "23:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x4f80e228  // sdot v8.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
       "add x10, x10, #0x40\n"
       "24:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -465,12 +464,12 @@
       "48:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 49f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 50f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -478,7 +477,7 @@
       "b 50f\n"
       "49:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "50:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "blt 53f\n"
@@ -491,137 +490,137 @@
       "51:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "sub x27, x27, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 51b\n"
       "52:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "add x26, x26, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      ".inst 0x4fa0e228  // sdot v8.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22c  // sdot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4fa0e209  // sdot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20d  // sdot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x4fa0e22a  // sdot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e22e  // sdot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4fa0e20b  // sdot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e20f  // sdot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x4f80ea28  // sdot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2c  // sdot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x4f80ea09  // sdot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0d  // sdot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x4f80ea2a  // sdot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea2e  // sdot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x4f80ea0b  // sdot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea0f  // sdot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x4fa0ea28  // sdot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2c  // sdot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x4fa0ea09  // sdot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0d  // sdot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea2a  // sdot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea2e  // sdot v14.4s, v17.16b, v1.4b[3]\n"
+      ".inst 0x4fa0ea0b  // sdot v11.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea0f  // sdot v15.4s, v16.16b, v1.4b[3]\n"
       "53:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 58f\n"
       "cmp x27, #0x4\n"
       "blt 55f\n"
       "54:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x4f93e228  // sdot v8.4s, v17.16b, v19.4b[0]\n"
+      ".inst 0x4f92e22c  // sdot v12.4s, v17.16b, v18.4b[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4f93e209  // sdot v9.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20d  // sdot v13.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f93e22a  // sdot v10.4s, v17.16b, v19.4b[0]\n"
+      ".inst 0x4f92e22e  // sdot v14.4s, v17.16b, v18.4b[0]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f93e20b  // sdot v11.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x4f92e20f  // sdot v15.4s, v16.16b, v18.4b[0]\n"
       "bge 54b\n"
       "55:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x27, 58f\n"
@@ -636,19 +635,19 @@
       "ldr b0, [x26, #0x0]\n"
       "ldr b1, [x25, #0x0]\n"
       "57:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x4f80e228  // sdot v8.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22c  // sdot v12.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4f80e209  // sdot v9.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20d  // sdot v13.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x4f80e22a  // sdot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x4f81e22e  // sdot v14.4s, v17.16b, v1.4b[0]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f80e20b  // sdot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x4f81e20f  // sdot v15.4s, v16.16b, v1.4b[0]\n"
       "58:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -866,13 +865,13 @@
       "82:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 83f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 84f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -881,8 +880,8 @@
       "b 84f\n"
       "83:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "84:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "blt 87f\n"
@@ -899,75 +898,75 @@
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
       "cmp x27, #0x20\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x10, #0x50]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 85b\n"
@@ -977,98 +976,98 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x10, #0x50]\n"
+      ".inst 0x4fa0e2a8  // sdot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ac  // sdot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b0  // sdot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      ".inst 0x4fa0e289  // sdot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28d  // sdot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e291  // sdot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      ".inst 0x4fa0e2aa  // sdot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e2ae  // sdot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e2b2  // sdot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      ".inst 0x4fa0e28b  // sdot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e28f  // sdot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e293  // sdot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      ".inst 0x4f80eaa8  // sdot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaac  // sdot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab0  // sdot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      ".inst 0x4f80ea89  // sdot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8d  // sdot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea91  // sdot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      ".inst 0x4f80eaaa  // sdot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x4f81eaae  // sdot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x4f82eab2  // sdot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      ".inst 0x4f80ea8b  // sdot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x4f81ea8f  // sdot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x4f82ea93  // sdot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      ".inst 0x4fa0eaa8  // sdot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaac  // sdot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab0  // sdot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      ".inst 0x4fa0ea89  // sdot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8d  // sdot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea91  // sdot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa0eaaa  // sdot v10.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eaae  // sdot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eab2  // sdot v18.4s, v21.16b, v2.4b[3]\n"
+      ".inst 0x4fa0ea8b  // sdot v11.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ea8f  // sdot v15.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ea93  // sdot v19.4s, v20.16b, v2.4b[3]\n"
       "87:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 92f\n"
       "cmp x27, #0x4\n"
       "blt 89f\n"
       "88:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr q21, [x10, #0x0]\n"
+      ".inst 0x4f98e2a8  // sdot v8.4s, v21.16b, v24.4b[0]\n"
+      ".inst 0x4f97e2ac  // sdot v12.4s, v21.16b, v23.4b[0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      ".inst 0x4f96e2b0  // sdot v16.4s, v21.16b, v22.4b[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      ".inst 0x4f98e289  // sdot v9.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28d  // sdot v13.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e291  // sdot v17.4s, v20.16b, v22.4b[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f98e2aa  // sdot v10.4s, v21.16b, v24.4b[0]\n"
+      ".inst 0x4f97e2ae  // sdot v14.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x4f96e2b2  // sdot v18.4s, v21.16b, v22.4b[0]\n"
+      ".inst 0x4f98e28b  // sdot v11.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x4f97e28f  // sdot v15.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x4f96e293  // sdot v19.4s, v20.16b, v22.4b[0]\n"
       "bge 88b\n"
       "89:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 92f\n"
@@ -1086,23 +1085,23 @@
       "ldr b1, [x25, #0x0]\n"
       "ldr b2, [x24, #0x0]\n"
       "91:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q21, [x10, #0x0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      ".inst 0x4f80e2a8  // sdot v8.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ac  // sdot v12.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b0  // sdot v16.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      ".inst 0x4f80e289  // sdot v9.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28d  // sdot v13.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e291  // sdot v17.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f80e2aa  // sdot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x4f81e2ae  // sdot v14.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x4f82e2b2  // sdot v18.4s, v21.16b, v2.4b[0]\n"
+      ".inst 0x4f80e28b  // sdot v11.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x4f81e28f  // sdot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x4f82e293  // sdot v19.4s, v20.16b, v2.4b[0]\n"
       "92:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1367,14 +1366,14 @@
       "116:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 117f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 118f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1384,9 +1383,9 @@
       "b 118f\n"
       "117:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "118:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "blt 121f\n"
@@ -1405,7 +1404,7 @@
       "add x26, x26, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1413,85 +1412,85 @@
       "add x23, x23, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 119b\n"
@@ -1502,7 +1501,7 @@
       "add x25, x25, #0x10\n"
       ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1510,112 +1509,112 @@
       "sub x27, x27, #0x10\n"
       ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      ".inst 0x4fa0e328  // sdot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32c  // sdot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e330  // sdot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e334  // sdot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      ".inst 0x4fa0e309  // sdot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30d  // sdot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e311  // sdot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e315  // sdot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      ".inst 0x4fa0e32a  // sdot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e32e  // sdot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e332  // sdot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e336  // sdot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      ".inst 0x4fa0e30b  // sdot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e30f  // sdot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e313  // sdot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e317  // sdot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      ".inst 0x4f80eb28  // sdot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2c  // sdot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb30  // sdot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb34  // sdot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      ".inst 0x4f80eb09  // sdot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0d  // sdot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb11  // sdot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb15  // sdot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      ".inst 0x4f80eb2a  // sdot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb2e  // sdot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb32  // sdot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb36  // sdot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      ".inst 0x4f80eb0b  // sdot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb0f  // sdot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb13  // sdot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb17  // sdot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      ".inst 0x4fa0eb28  // sdot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2c  // sdot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb30  // sdot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb34  // sdot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      ".inst 0x4fa0eb09  // sdot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0d  // sdot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb11  // sdot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb15  // sdot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb2a  // sdot v10.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb2e  // sdot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb32  // sdot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb36  // sdot v22.4s, v25.16b, v3.4b[3]\n"
+      ".inst 0x4fa0eb0b  // sdot v11.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb0f  // sdot v15.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb13  // sdot v19.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb17  // sdot v23.4s, v24.16b, v3.4b[3]\n"
       "121:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 126f\n"
       "cmp x27, #0x4\n"
       "blt 123f\n"
       "122:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      ".inst 0x4f9de328  // sdot v8.4s, v25.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce32c  // sdot v12.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be330  // sdot v16.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae334  // sdot v20.4s, v25.16b, v26.4b[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      ".inst 0x4f9de309  // sdot v9.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30d  // sdot v13.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be311  // sdot v17.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae315  // sdot v21.4s, v24.16b, v26.4b[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f9de32a  // sdot v10.4s, v25.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce32e  // sdot v14.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x4f9be332  // sdot v18.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae336  // sdot v22.4s, v25.16b, v26.4b[0]\n"
+      ".inst 0x4f9de30b  // sdot v11.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x4f9ce30f  // sdot v15.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x4f9be313  // sdot v19.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x4f9ae317  // sdot v23.4s, v24.16b, v26.4b[0]\n"
       "bge 122b\n"
       "123:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 126f\n"
@@ -1636,27 +1635,27 @@
       "ldr b2, [x24, #0x0]\n"
       "ldr b3, [x23, #0x0]\n"
       "125:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      ".inst 0x4f80e328  // sdot v8.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32c  // sdot v12.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e330  // sdot v16.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e334  // sdot v20.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      ".inst 0x4f80e309  // sdot v9.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30d  // sdot v13.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e311  // sdot v17.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e315  // sdot v21.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f80e32a  // sdot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x4f81e32e  // sdot v14.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x4f82e332  // sdot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x4f83e336  // sdot v22.4s, v25.16b, v3.4b[0]\n"
+      ".inst 0x4f80e30b  // sdot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x4f81e30f  // sdot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x4f82e313  // sdot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x4f83e317  // sdot v23.4s, v24.16b, v3.4b[0]\n"
       "126:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1968,15 +1967,15 @@
       "150:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 151f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 152f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1987,10 +1986,10 @@
       "b 152f\n"
       "151:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "152:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "blt 155f\n"
@@ -2013,7 +2012,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2022,100 +2021,100 @@
       "cmp x27, #0x20\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
       "ldr q3, [x23, #0x0]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 153b\n"
@@ -2129,7 +2128,7 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2138,131 +2137,131 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
       ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      ".inst 0x4fa0e3a8  // sdot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ac  // sdot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b0  // sdot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b4  // sdot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3b8  // sdot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      ".inst 0x4fa0e389  // sdot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38d  // sdot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e391  // sdot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e395  // sdot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e399  // sdot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      ".inst 0x4fa0e3aa  // sdot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e3ae  // sdot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e3b2  // sdot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e3b6  // sdot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e3ba  // sdot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      ".inst 0x4fa0e38b  // sdot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e38f  // sdot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e393  // sdot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e397  // sdot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e39b  // sdot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      ".inst 0x4f80eba8  // sdot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebac  // sdot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb0  // sdot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb4  // sdot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebb8  // sdot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      ".inst 0x4f80eb89  // sdot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8d  // sdot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb91  // sdot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb95  // sdot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb99  // sdot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      ".inst 0x4f80ebaa  // sdot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x4f81ebae  // sdot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x4f82ebb2  // sdot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x4f83ebb6  // sdot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x4f84ebba  // sdot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      ".inst 0x4f80eb8b  // sdot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x4f81eb8f  // sdot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x4f82eb93  // sdot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x4f83eb97  // sdot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x4f84eb9b  // sdot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      ".inst 0x4fa0eba8  // sdot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebac  // sdot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb0  // sdot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb4  // sdot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebb8  // sdot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      ".inst 0x4fa0eb89  // sdot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8d  // sdot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb91  // sdot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb95  // sdot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb99  // sdot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
-      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa0ebaa  // sdot v10.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x4fa1ebae  // sdot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x4fa2ebb2  // sdot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x4fa3ebb6  // sdot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x4fa4ebba  // sdot v26.4s, v29.16b, v4.4b[3]\n"
+      ".inst 0x4fa0eb8b  // sdot v11.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x4fa1eb8f  // sdot v15.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x4fa2eb93  // sdot v19.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x4fa3eb97  // sdot v23.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x4fa4eb9b  // sdot v27.4s, v28.16b, v4.4b[3]\n"
       "155:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 160f\n"
       "cmp x27, #0x4\n"
       "blt 157f\n"
       "156:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
       "ldr s1, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s0, [x24], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q29, [x10, #0x0]\n"
+      ".inst 0x4f82e3a8  // sdot v8.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      ".inst 0x4f80e3b0  // sdot v16.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b4  // sdot v20.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3b8  // sdot v24.4s, v29.16b, v30.4b[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      ".inst 0x4f82e389  // sdot v9.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e391  // sdot v17.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe395  // sdot v21.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee399  // sdot v25.4s, v28.16b, v30.4b[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f82e3aa  // sdot v10.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f80e3b2  // sdot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe3b6  // sdot v22.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee3ba  // sdot v26.4s, v29.16b, v30.4b[0]\n"
+      ".inst 0x4f82e38b  // sdot v11.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f80e393  // sdot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f9fe397  // sdot v23.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x4f9ee39b  // sdot v27.4s, v28.16b, v30.4b[0]\n"
       "bge 156b\n"
       "157:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 160f\n"
@@ -2286,31 +2285,31 @@
       "ldr b3, [x23, #0x0]\n"
       "ldr b4, [x22, #0x0]\n"
       "159:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q29, [x10, #0x0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      ".inst 0x4f80e3a8  // sdot v8.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ac  // sdot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b0  // sdot v16.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b4  // sdot v20.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3b8  // sdot v24.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      ".inst 0x4f80e389  // sdot v9.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38d  // sdot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e391  // sdot v17.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e395  // sdot v21.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e399  // sdot v25.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f80e3aa  // sdot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x4f81e3ae  // sdot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x4f82e3b2  // sdot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x4f83e3b6  // sdot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x4f84e3ba  // sdot v26.4s, v29.16b, v4.4b[0]\n"
+      ".inst 0x4f80e38b  // sdot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x4f81e38f  // sdot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x4f82e393  // sdot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x4f83e397  // sdot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x4f84e39b  // sdot v27.4s, v28.16b, v4.4b[0]\n"
       "160:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2672,16 +2671,16 @@
       "184:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 185f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 186f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2693,11 +2692,11 @@
       "b 186f\n"
       "185:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "186:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "blt 189f\n"
@@ -2976,43 +2975,43 @@
       "cmp x27, #0x4\n"
       "blt 191f\n"
       "190:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s7, [x26], #0x4\n"
+      "ldr s6, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s4, [x23], #0x4\n"
+      "ldr s3, [x22], #0x4\n"
+      "ldr s2, [x21], #0x4\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x4f87e028  // sdot v8.4s, v1.16b, v7.4b[0]\n"
+      ".inst 0x4f86e02c  // sdot v12.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e030  // sdot v16.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e034  // sdot v20.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e038  // sdot v24.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03c  // sdot v28.4s, v1.16b, v2.4b[0]\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x4f87e009  // sdot v9.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00d  // sdot v13.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e011  // sdot v17.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e015  // sdot v21.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e019  // sdot v25.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01d  // sdot v29.4s, v0.16b, v2.4b[0]\n"
+      "ldr q0, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f87e02a  // sdot v10.4s, v1.16b, v7.4b[0]\n"
+      ".inst 0x4f86e02e  // sdot v14.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x4f85e032  // sdot v18.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x4f84e036  // sdot v22.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x4f83e03a  // sdot v26.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x4f82e03e  // sdot v30.4s, v1.16b, v2.4b[0]\n"
+      ".inst 0x4f87e00b  // sdot v11.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x4f86e00f  // sdot v15.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x4f85e013  // sdot v19.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x4f84e017  // sdot v23.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x4f83e01b  // sdot v27.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x4f82e01f  // sdot v31.4s, v0.16b, v2.4b[0]\n"
       "bge 190b\n"
       "191:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 194f\n"
@@ -3039,35 +3038,35 @@
       "ldr b4, [x22, #0x0]\n"
       "ldr b5, [x21, #0x0]\n"
       "193:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q7, [x10, #0x0]\n"
+      "ldr q6, [x10, #0x10]\n"
+      ".inst 0x4f80e0e8  // sdot v8.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ec  // sdot v12.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f0  // sdot v16.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f4  // sdot v20.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f8  // sdot v24.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fc  // sdot v28.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x10, #0x20]\n"
+      ".inst 0x4f80e0c9  // sdot v9.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cd  // sdot v13.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d1  // sdot v17.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d5  // sdot v21.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d9  // sdot v25.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dd  // sdot v29.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0ea  // sdot v10.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ee  // sdot v14.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f2  // sdot v18.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f6  // sdot v22.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fa  // sdot v26.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fe  // sdot v30.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0cb  // sdot v11.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0cf  // sdot v15.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d3  // sdot v19.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d7  // sdot v23.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0db  // sdot v27.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0df  // sdot v31.4s, v6.16b, v5.4b[0]\n"
       "194:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3254,7 +3253,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "206:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
index 50ccb6f..4905ba5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -92,7 +92,7 @@
                 case CPUModel::A510:
                     return { 33.62, 3.92, 0.48 };
                 case CPUModel::V1:
-                    return { 86.36, 19.25, 0.92 };
+                    return { 63.94, 16.18, 0.83 };
             }
         }
 
@@ -109,5 +109,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
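The header hunk above retunes the Neoverse V1 entry returned by get_performance_parameters from { 86.36, 19.25, 0.92 } to { 63.94, 16.18, 0.83 }; these are the cycle-estimate coefficients (presumably kernel MACs per cycle plus prepare/merge bandwidth terms) consumed by estimate_cycles when ranking this kernel against the other GemmImplementation candidates. The generic.cpp hunks that follow are a regeneration of the same SMMLA kernel: scratch vector registers and the x20/x21 address registers are renumbered, while the accumulators and the matrix multiply-accumulate pattern are unchanged. As an illustrative sketch (not part of the patch), a scalar model of the core instruction "smmla vd.4s, vn.16b, vm.16b" is:

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of: smmla vd.4s, vn.16b, vm.16b
// vd is treated as a row-major 2x2 int32 accumulator, vn and vm as 2x8
// signed 8-bit matrices; the instruction performs C += A * B^T.
static void smmla_4s_16b(std::array<int32_t, 4> &c,
                         const std::array<int8_t, 16> &a,
                         const std::array<int8_t, 16> &b) {
    for (int i = 0; i < 2; i++) {        // rows of A / rows of C
        for (int j = 0; j < 2; j++) {    // rows of B / columns of C
            int32_t acc = 0;
            for (int k = 0; k < 8; k++) {
                acc += int32_t(a[8 * i + k]) * int32_t(b[8 * j + k]);
            }
            c[2 * i + j] += acc;
        }
    }
}

int main() {
    std::array<int32_t, 4> c{};
    std::array<int8_t, 16> a{}, b{};
    for (int i = 0; i < 16; i++) { a[i] = 1; b[i] = int8_t(i); }
    smmla_4s_16b(c, a, b);
    // Row 0 of B sums to 28, row 1 sums to 92, so both rows of C are (28, 92).
    std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);  // prints: 28 92 28 92
    return 0;
}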
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
index f48623e..f8a76b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
@@ -77,7 +77,6 @@
     ka.N = N;
     ka.B_ptr = B_ptr;
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 186f\n"
@@ -178,11 +177,11 @@
       "15:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -198,41 +197,41 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 19f\n"
       "18:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "trn1 v19.2d, v1.2d, v20.2d\n"
+      ".inst 0x4e87a668  // smmla v8.4s, v19.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x4e86a66c  // smmla v12.4s, v19.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x4e92a669  // smmla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x4e91a66d  // smmla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a66a  // smmla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v20.2d\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x4e92a428  // smmla v8.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x4e91a42c  // smmla v12.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x4e92a429  // smmla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x4e91a42d  // smmla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x4e92a42a  // smmla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x4e91a42e  // smmla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e92a42b  // smmla v11.4s, v1.16b, v18.16b\n"
+      ".inst 0x4e91a42f  // smmla v15.4s, v1.16b, v17.16b\n"
       "ldr q1, [x26, #0x0]\n"
       "add x10, x10, #0x100\n"
       "ldr q7, [x10, #0x0]\n"
@@ -240,40 +239,40 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       "bge 18b\n"
       "19:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "trn1 v20.2d, v1.2d, v21.2d\n"
+      ".inst 0x4e87a688  // smmla v8.4s, v20.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x4e86a68c  // smmla v12.4s, v20.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x4e92a689  // smmla v9.4s, v20.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x4e91a68d  // smmla v13.4s, v20.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a68a  // smmla v10.4s, v20.16b, v18.16b\n"
+      "ldr q19, [x10, #0x60]\n"
+      ".inst 0x4e91a68e  // smmla v14.4s, v20.16b, v17.16b\n"
+      "ldr q18, [x10, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v21.2d\n"
+      ".inst 0x4e93a68b  // smmla v11.4s, v20.16b, v19.16b\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x4e92a68f  // smmla v15.4s, v20.16b, v18.16b\n"
+      "ldr q19, [x10, #0x90]\n"
+      ".inst 0x4e91a428  // smmla v8.4s, v1.16b, v17.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x4e93a42c  // smmla v12.4s, v1.16b, v19.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x4e92a429  // smmla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x4e91a42d  // smmla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x4e92a42a  // smmla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x4e91a42e  // smmla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e92a42b  // smmla v11.4s, v1.16b, v18.16b\n"
+      ".inst 0x4e91a42f  // smmla v15.4s, v1.16b, v17.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "20:"  // Height 1: Multiply loop: Main loop skip
@@ -281,26 +280,26 @@
       "cmp x27, #0x8\n"
       "blt 22f\n"
       "21:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr d19, [x26], #0x8\n"
+      "ldr q18, [x10, #0x0]\n"
+      "trn1 v19.2d, v19.2d, v17.2d\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x4e92a668  // smmla v8.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x4e91a66c  // smmla v12.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x4e92a669  // smmla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x4e91a66d  // smmla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a66a  // smmla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "bge 21b\n"
       "22:"  // Height 1: Multiply loop: Skip odd blocks
@@ -325,23 +324,23 @@
       "25:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b1, [x26, #0x0]\n"
       "26:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      "ldr q23, [x10, #0x0]\n"
+      "ldr q18, [x10, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v17.2d\n"
+      ".inst 0x4e97a668  // smmla v8.4s, v19.16b, v23.16b\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x4e92a66c  // smmla v12.4s, v19.16b, v18.16b\n"
+      "ldr q31, [x10, #0x30]\n"
+      ".inst 0x4e91a669  // smmla v9.4s, v19.16b, v17.16b\n"
+      "ldr q20, [x10, #0x40]\n"
+      ".inst 0x4e9fa66d  // smmla v13.4s, v19.16b, v31.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e94a66a  // smmla v10.4s, v19.16b, v20.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "27:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -525,12 +524,12 @@
       "52:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 53f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 54f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -538,7 +537,7 @@
       "b 54f\n"
       "53:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "54:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "blt 57f\n"
@@ -549,85 +548,85 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 56f\n"
       "55:"  // Height 2: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x4e87a668  // smmla v8.4s, v19.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x4e86a66c  // smmla v12.4s, v19.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x4e92a669  // smmla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x4e91a66d  // smmla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a66a  // smmla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x4e92a428  // smmla v8.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x4e91a42c  // smmla v12.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x4e92a429  // smmla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x4e91a42d  // smmla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x4e92a42a  // smmla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x4e91a42e  // smmla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "ldr q2, [x25, #0x0]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e92a42b  // smmla v11.4s, v1.16b, v18.16b\n"
       "add x10, x10, #0x100\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e91a42f  // smmla v15.4s, v1.16b, v17.16b\n"
       "ldr q1, [x26, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "bge 55b\n"
       "56:"  // Height 2: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x4e87a668  // smmla v8.4s, v19.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x4e86a66c  // smmla v12.4s, v19.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x4e92a669  // smmla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x4e91a66d  // smmla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a66a  // smmla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x4e92a428  // smmla v8.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x4e91a42c  // smmla v12.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x4e92a429  // smmla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x4e91a42d  // smmla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x4e92a42a  // smmla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x4e91a42e  // smmla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e92a42b  // smmla v11.4s, v1.16b, v18.16b\n"
+      ".inst 0x4e91a42f  // smmla v15.4s, v1.16b, v17.16b\n"
       "sub x27, x27, #0x10\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
@@ -637,27 +636,27 @@
       "cmp x27, #0x8\n"
       "blt 59f\n"
       "58:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d18, [x26], #0x8\n"
+      "ldr d17, [x25], #0x8\n"
+      "trn1 v19.2d, v18.2d, v17.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q22, [x10, #0x10]\n"
+      ".inst 0x4e91a668  // smmla v8.4s, v19.16b, v17.16b\n"
+      ".inst 0x4e96a66c  // smmla v12.4s, v19.16b, v22.16b\n"
+      "ldr q1, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x4e81a669  // smmla v9.4s, v19.16b, v1.16b\n"
+      ".inst 0x4e91a66d  // smmla v13.4s, v19.16b, v17.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a66a  // smmla v10.4s, v19.16b, v18.16b\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      "ldr q17, [x10, #0x70]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "bge 58b\n"
       "59:"  // Height 2: Multiply loop: Skip odd blocks
@@ -689,23 +688,23 @@
       "ldr b1, [x26, #0x0]\n"
       "ldr b2, [x25, #0x0]\n"
       "63:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x4e92a668  // smmla v8.4s, v19.16b, v18.16b\n"
+      "ldr q5, [x10, #0x20]\n"
+      ".inst 0x4e91a66c  // smmla v12.4s, v19.16b, v17.16b\n"
+      "ldr q21, [x10, #0x30]\n"
+      ".inst 0x4e85a669  // smmla v9.4s, v19.16b, v5.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x4e95a66d  // smmla v13.4s, v19.16b, v21.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x4e92a66a  // smmla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x4e91a66e  // smmla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x4e92a66b  // smmla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x4e91a66f  // smmla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "64:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -953,13 +952,13 @@
       "89:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 90f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 91f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -968,8 +967,8 @@
       "b 91f\n"
       "90:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "91:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "blt 94f\n"
@@ -981,167 +980,167 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 93f\n"
       "92:"  // Height 3: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a788  // smmla v8.4s, v28.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x4e87a770  // smmla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e86a78c  // smmla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x4e86a774  // smmla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa428  // smmla v8.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa470  // smmla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x4e99a42c  // smmla v12.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e99a474  // smmla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x4e9aa429  // smmla v9.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa471  // smmla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x4e99a42d  // smmla v13.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x4e99a475  // smmla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x4e9aa42a  // smmla v10.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa472  // smmla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x4e99a42e  // smmla v14.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a476  // smmla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e9aa42b  // smmla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa473  // smmla v19.4s, v3.16b, v26.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a42f  // smmla v15.4s, v1.16b, v25.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e99a477  // smmla v23.4s, v3.16b, v25.16b\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "bge 92b\n"
       "93:"  // Height 3: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e87a788  // smmla v8.4s, v28.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x4e87a770  // smmla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e86a78c  // smmla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x4e86a774  // smmla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
+      ".inst 0x4e9aa428  // smmla v8.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa470  // smmla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x4e99a42c  // smmla v12.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x4e99a474  // smmla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x4e9aa429  // smmla v9.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa471  // smmla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x4e99a42d  // smmla v13.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a475  // smmla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x4e9aa42a  // smmla v10.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa472  // smmla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x4e99a42e  // smmla v14.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a476  // smmla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e9aa42b  // smmla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa473  // smmla v19.4s, v3.16b, v26.16b\n"
+      ".inst 0x4e99a42f  // smmla v15.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a477  // smmla v23.4s, v3.16b, v25.16b\n"
       "94:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 101f\n"
       "cmp x27, #0x8\n"
       "blt 96f\n"
       "95:"  // Height 3: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr q26, [x10, #0x0]\n"
+      "trn1 v27.2d, v25.2d, v27.2d\n"
+      ".inst 0x4e9aa788  // smmla v8.4s, v28.16b, v26.16b\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x4e9aa770  // smmla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e99a78c  // smmla v12.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a774  // smmla v20.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
       "bge 95b\n"
       "96:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 101f\n"
@@ -1179,33 +1178,33 @@
       "ldr b2, [x25, #0x0]\n"
       "ldr b3, [x24, #0x0]\n"
       "100:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v25.2d\n"
+      ".inst 0x4e9aa788  // smmla v8.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa770  // smmla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e9da78c  // smmla v12.4s, v28.16b, v29.16b\n"
+      ".inst 0x4e9da774  // smmla v20.4s, v27.16b, v29.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
       "101:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1499,14 +1498,14 @@
       "126:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 127f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 128f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1516,9 +1515,9 @@
       "b 128f\n"
       "127:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "128:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "blt 131f\n"
@@ -1531,173 +1530,173 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 130f\n"
       "129:"  // Height 4: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a788  // smmla v8.4s, v28.16b, v7.16b\n"
       "sub x27, x27, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e87a770  // smmla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e86a78c  // smmla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x4e86a774  // smmla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
       "add x23, x23, #0x10\n"
       "ldr q4, [x23, #0x0]\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa428  // smmla v8.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa470  // smmla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x4e99a42c  // smmla v12.4s, v1.16b, v25.16b\n"
       "cmp x27, #0x20\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e99a474  // smmla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x4e9aa429  // smmla v9.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa471  // smmla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x4e99a42d  // smmla v13.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e99a475  // smmla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x4e9aa42a  // smmla v10.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa472  // smmla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x4e99a42e  // smmla v14.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x4e99a476  // smmla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e9aa42b  // smmla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa473  // smmla v19.4s, v3.16b, v26.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e99a42f  // smmla v15.4s, v1.16b, v25.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e99a477  // smmla v23.4s, v3.16b, v25.16b\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "bge 129b\n"
       "130:"  // Height 4: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a788  // smmla v8.4s, v28.16b, v7.16b\n"
       "add x26, x26, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e87a770  // smmla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e86a78c  // smmla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x4e86a774  // smmla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
+      ".inst 0x4e9aa428  // smmla v8.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e9aa470  // smmla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x4e99a42c  // smmla v12.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e99a474  // smmla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x4e9aa429  // smmla v9.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x4e9aa471  // smmla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x4e99a42d  // smmla v13.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a475  // smmla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x4e9aa42a  // smmla v10.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa472  // smmla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x4e99a42e  // smmla v14.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a476  // smmla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e9aa42b  // smmla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x4e9aa473  // smmla v19.4s, v3.16b, v26.16b\n"
+      ".inst 0x4e99a42f  // smmla v15.4s, v1.16b, v25.16b\n"
+      ".inst 0x4e99a477  // smmla v23.4s, v3.16b, v25.16b\n"
       "131:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 138f\n"
       "cmp x27, #0x8\n"
       "blt 133f\n"
       "132:"  // Height 4: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "trn1 v27.2d, v26.2d, v25.2d\n"
       "cmp x27, #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x4e9aa788  // smmla v8.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa770  // smmla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e99a78c  // smmla v12.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a774  // smmla v20.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
       "bge 132b\n"
       "133:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 138f\n"
@@ -1742,33 +1741,33 @@
       "ldr b3, [x24, #0x0]\n"
       "ldr b4, [x23, #0x0]\n"
       "137:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e9aa788  // smmla v8.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa770  // smmla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x4e99a78c  // smmla v12.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a774  // smmla v20.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x4e9aa789  // smmla v9.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa771  // smmla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x4e99a78d  // smmla v13.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a775  // smmla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x4e9aa78a  // smmla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa772  // smmla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x4e99a78e  // smmla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a776  // smmla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e9aa78b  // smmla v11.4s, v28.16b, v26.16b\n"
+      ".inst 0x4e9aa773  // smmla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x4e99a78f  // smmla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x4e99a777  // smmla v23.4s, v27.16b, v25.16b\n"
       "138:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2125,15 +2124,15 @@
       "163:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 164f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 165f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2144,10 +2143,10 @@
       "b 165f\n"
       "164:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "165:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "blt 168f\n"
@@ -2160,174 +2159,174 @@
       "ldr q7, [x10, #0x0]\n"
       "blt 167f\n"
       "166:"  // Height 5: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a4c8  // smmla v8.4s, v6.16b, v7.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
       "sub x27, x27, #0x10\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x10, #0x10]\n"
       ".inst 0x4e87a498  // smmla v24.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4cc  // smmla v12.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a454  // smmla v20.4s, v2.16b, v0.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4e86a49c  // smmla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a49c  // smmla v28.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x4e87a4c9  // smmla v9.4s, v6.16b, v7.16b\n"
       "add x25, x25, #0x10\n"
       ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
       ".inst 0x4e87a499  // smmla v25.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x40]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4cd  // smmla v13.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a455  // smmla v21.4s, v2.16b, v0.16b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4e86a49d  // smmla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a49d  // smmla v29.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x4e87a4ca  // smmla v10.4s, v6.16b, v7.16b\n"
       "cmp x27, #0x20\n"
       ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
       ".inst 0x4e87a49a  // smmla v26.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4ce  // smmla v14.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a456  // smmla v22.4s, v2.16b, v0.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x4e86a49e  // smmla v30.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a49e  // smmla v30.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x4e87a4cb  // smmla v11.4s, v6.16b, v7.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
       ".inst 0x4e87a49b  // smmla v27.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4cf  // smmla v15.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a457  // smmla v23.4s, v2.16b, v0.16b\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x4e80a49f  // smmla v31.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "ldr q6, [x10, #0xa0]\n"
+      ".inst 0x4e80a42c  // smmla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bc  // smmla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xb0]\n"
+      ".inst 0x4e86a429  // smmla v9.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a471  // smmla v17.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4b9  // smmla v25.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xc0]\n"
+      ".inst 0x4e80a42d  // smmla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bd  // smmla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xd0]\n"
+      ".inst 0x4e86a42a  // smmla v10.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a472  // smmla v18.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4ba  // smmla v26.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xe0]\n"
+      ".inst 0x4e80a42e  // smmla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4be  // smmla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x4e86a42b  // smmla v11.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a473  // smmla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4bb  // smmla v27.4s, v5.16b, v6.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e80a42f  // smmla v15.4s, v1.16b, v0.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x4e80a4bf  // smmla v31.4s, v5.16b, v0.16b\n"
       "ldr q5, [x22, #0x0]\n"
       "bge 166b\n"
       "167:"  // Height 5: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a4c8  // smmla v8.4s, v6.16b, v7.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
       "add x26, x26, #0x10\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x10, #0x10]\n"
       ".inst 0x4e87a498  // smmla v24.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4cc  // smmla v12.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a454  // smmla v20.4s, v2.16b, v0.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x4e86a49c  // smmla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a49c  // smmla v28.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x4e87a4c9  // smmla v9.4s, v6.16b, v7.16b\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
       ".inst 0x4e87a499  // smmla v25.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x40]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4cd  // smmla v13.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a455  // smmla v21.4s, v2.16b, v0.16b\n"
       "add x22, x22, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4e86a49d  // smmla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a49d  // smmla v29.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x4e87a4ca  // smmla v10.4s, v6.16b, v7.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
       ".inst 0x4e87a49a  // smmla v26.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a4ce  // smmla v14.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a456  // smmla v22.4s, v2.16b, v0.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x4e86a49e  // smmla v30.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a49e  // smmla v30.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x4e87a4cb  // smmla v11.4s, v6.16b, v7.16b\n"
       "prfm pldl1keep, [x22, #0x80]\n"
       ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
       ".inst 0x4e87a49b  // smmla v27.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x80]\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x4e80a4cf  // smmla v15.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e80a457  // smmla v23.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e80a49f  // smmla v31.4s, v4.16b, v0.16b\n"
+      "ldr q2, [x10, #0x90]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
+      "ldr q0, [x10, #0xa0]\n"
+      ".inst 0x4e82a42c  // smmla v12.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a474  // smmla v20.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4bc  // smmla v28.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xb0]\n"
+      ".inst 0x4e80a429  // smmla v9.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a471  // smmla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4b9  // smmla v25.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xc0]\n"
+      ".inst 0x4e82a42d  // smmla v13.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a475  // smmla v21.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4bd  // smmla v29.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xd0]\n"
+      ".inst 0x4e80a42a  // smmla v10.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a472  // smmla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4ba  // smmla v26.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xe0]\n"
+      ".inst 0x4e82a42e  // smmla v14.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a476  // smmla v22.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4be  // smmla v30.4s, v5.16b, v2.16b\n"
       "ldr q6, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x4e80a42b  // smmla v11.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a473  // smmla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bb  // smmla v27.4s, v5.16b, v0.16b\n"
       ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
       ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
       ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
@@ -2337,48 +2336,48 @@
       "blt 170f\n"
       "169:"  // Height 5: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a498  // smmla v24.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
+      "ldr d0, [x22], #0x8\n"
+      "ldr q1, [x10, #0x0]\n"
+      "trn1 v2.2d, v0.2d, v2.2d\n"
+      ".inst 0x4e81a488  // smmla v8.4s, v4.16b, v1.16b\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x4e81a470  // smmla v16.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a458  // smmla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x4e80a48c  // smmla v12.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
       "cmp x27, #0x8\n"
-      ".inst 0x4e87a49c  // smmla v28.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a499  // smmla v25.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49d  // smmla v29.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49a  // smmla v26.4s, v4.16b, v6.16b\n"
+      ".inst 0x4e80a45c  // smmla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x4e81a489  // smmla v9.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a471  // smmla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x40]\n"
+      ".inst 0x4e80a48d  // smmla v13.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45d  // smmla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x4e81a48a  // smmla v10.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a472  // smmla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45a  // smmla v26.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49e  // smmla v30.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e80a48e  // smmla v14.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45e  // smmla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x4e86a48b  // smmla v11.4s, v4.16b, v6.16b\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49b  // smmla v27.4s, v4.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49f  // smmla v31.4s, v4.16b, v7.16b\n"
+      ".inst 0x4e86a473  // smmla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a45b  // smmla v27.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a48f  // smmla v15.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45f  // smmla v31.4s, v2.16b, v0.16b\n"
       "bge 169b\n"
       "170:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 175f\n"
@@ -2430,42 +2429,42 @@
       "ldr b4, [x23, #0x0]\n"
       "ldr b5, [x22, #0x0]\n"
       "174:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a498  // smmla v24.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49c  // smmla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a499  // smmla v25.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49d  // smmla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49a  // smmla v26.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49e  // smmla v30.4s, v4.16b, v6.16b\n"
+      "ldr q6, [x10, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      "trn1 v2.2d, v5.2d, v0.2d\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x4e86a4e8  // smmla v8.4s, v7.16b, v6.16b\n"
+      ".inst 0x4e86a470  // smmla v16.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a458  // smmla v24.4s, v2.16b, v6.16b\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x4e81a4ec  // smmla v12.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e81a474  // smmla v20.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45c  // smmla v28.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x4e80a4e9  // smmla v9.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e80a471  // smmla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a459  // smmla v25.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x40]\n"
+      ".inst 0x4e81a4ed  // smmla v13.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e81a475  // smmla v21.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45d  // smmla v29.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x4e80a4ea  // smmla v10.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e80a472  // smmla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45a  // smmla v26.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x60]\n"
+      ".inst 0x4e81a4ee  // smmla v14.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e81a476  // smmla v22.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45e  // smmla v30.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49b  // smmla v27.4s, v4.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
+      ".inst 0x4e80a4eb  // smmla v11.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e80a473  // smmla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45b  // smmla v27.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e86a4ef  // smmla v15.4s, v7.16b, v6.16b\n"
+      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a45f  // smmla v31.4s, v2.16b, v6.16b\n"
       "175:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2872,16 +2871,16 @@
       "200:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 201f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 202f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2893,11 +2892,11 @@
       "b 202f\n"
       "201:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "202:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "blt 205f\n"
@@ -2964,42 +2963,42 @@
       "ldr q2, [x25, #0x0]\n"
       "prfm pldl1keep, [x21, #0x80]\n"
       ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      "ldr q0, [x10, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "ldr q6, [x10, #0xa0]\n"
+      ".inst 0x4e80a42c  // smmla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bc  // smmla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xb0]\n"
+      ".inst 0x4e86a429  // smmla v9.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a471  // smmla v17.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4b9  // smmla v25.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xc0]\n"
+      ".inst 0x4e80a42d  // smmla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bd  // smmla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xd0]\n"
+      ".inst 0x4e86a42a  // smmla v10.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a472  // smmla v18.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4ba  // smmla v26.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xe0]\n"
+      ".inst 0x4e80a42e  // smmla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4be  // smmla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x4e86a42b  // smmla v11.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e86a473  // smmla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a4bb  // smmla v27.4s, v5.16b, v6.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e80a42f  // smmla v15.4s, v1.16b, v0.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x4e80a4bf  // smmla v31.4s, v5.16b, v0.16b\n"
       "ldr q5, [x22, #0x0]\n"
       "ldr q6, [x21, #0x0]\n"
       "bge 203b\n"
@@ -3055,35 +3054,35 @@
       ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
       ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
       ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      "ldr q2, [x10, #0x90]\n"
       ".inst 0x4e87a428  // smmla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x4e87a470  // smmla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x4e87a4b8  // smmla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x4e86a42c  // smmla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a474  // smmla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bc  // smmla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x4e87a429  // smmla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a471  // smmla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4b9  // smmla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x4e86a42d  // smmla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a475  // smmla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4bd  // smmla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x4e87a42a  // smmla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a472  // smmla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4ba  // smmla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e86a476  // smmla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e86a4be  // smmla v30.4s, v5.16b, v6.16b\n"
+      "ldr q0, [x10, #0xa0]\n"
+      ".inst 0x4e82a42c  // smmla v12.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a474  // smmla v20.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4bc  // smmla v28.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xb0]\n"
+      ".inst 0x4e80a429  // smmla v9.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a471  // smmla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4b9  // smmla v25.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xc0]\n"
+      ".inst 0x4e82a42d  // smmla v13.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a475  // smmla v21.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4bd  // smmla v29.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xd0]\n"
+      ".inst 0x4e80a42a  // smmla v10.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a472  // smmla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4ba  // smmla v26.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xe0]\n"
+      ".inst 0x4e82a42e  // smmla v14.4s, v1.16b, v2.16b\n"
+      ".inst 0x4e82a476  // smmla v22.4s, v3.16b, v2.16b\n"
+      ".inst 0x4e82a4be  // smmla v30.4s, v5.16b, v2.16b\n"
       "ldr q6, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x4e87a42b  // smmla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e87a473  // smmla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e87a4bb  // smmla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x4e80a42b  // smmla v11.4s, v1.16b, v0.16b\n"
+      ".inst 0x4e80a473  // smmla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a4bb  // smmla v27.4s, v5.16b, v0.16b\n"
       ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
       ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
       ".inst 0x4e86a4bf  // smmla v31.4s, v5.16b, v6.16b\n"
@@ -3093,49 +3092,49 @@
       "blt 207f\n"
       "206:"  // Height 6: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "cmp x27, #0x8\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a450  // smmla v16.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a498  // smmla v24.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49c  // smmla v28.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a451  // smmla v17.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a499  // smmla v25.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49d  // smmla v29.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a452  // smmla v18.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49a  // smmla v26.4s, v4.16b, v6.16b\n"
+      "ldr d1, [x22], #0x8\n"
+      "ldr d0, [x21], #0x8\n"
+      "trn1 v2.2d, v1.2d, v0.2d\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x4e81a488  // smmla v8.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a470  // smmla v16.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a458  // smmla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x4e80a48c  // smmla v12.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a474  // smmla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45c  // smmla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x4e81a489  // smmla v9.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a471  // smmla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a459  // smmla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x40]\n"
+      ".inst 0x4e80a48d  // smmla v13.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a475  // smmla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45d  // smmla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x4e81a48a  // smmla v10.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e81a472  // smmla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45a  // smmla v26.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x4e87a40e  // smmla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49e  // smmla v30.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x4e80a48e  // smmla v14.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a476  // smmla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45e  // smmla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e86a40b  // smmla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a453  // smmla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49b  // smmla v27.4s, v4.16b, v6.16b\n"
-      ".inst 0x4e87a40f  // smmla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49f  // smmla v31.4s, v4.16b, v7.16b\n"
+      ".inst 0x4e86a48b  // smmla v11.4s, v4.16b, v6.16b\n"
+      ".inst 0x4e86a473  // smmla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a45b  // smmla v27.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e80a48f  // smmla v15.4s, v4.16b, v0.16b\n"
+      ".inst 0x4e80a477  // smmla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45f  // smmla v31.4s, v2.16b, v0.16b\n"
       "bge 206b\n"
       "207:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 212f\n"
@@ -3194,42 +3193,42 @@
       "ldr b5, [x22, #0x0]\n"
       "ldr b6, [x21, #0x0]\n"
       "211:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x4e87a450  // smmla v16.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a498  // smmla v24.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x4e86a40c  // smmla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49c  // smmla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a451  // smmla v17.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a499  // smmla v25.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x4e86a40d  // smmla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49d  // smmla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e87a452  // smmla v18.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49a  // smmla v26.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x4e86a40e  // smmla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49e  // smmla v30.4s, v4.16b, v6.16b\n"
+      "ldr q0, [x10, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      ".inst 0x4e80a4e8  // smmla v8.4s, v7.16b, v0.16b\n"
+      "trn1 v2.2d, v5.2d, v6.2d\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x4e80a470  // smmla v16.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a458  // smmla v24.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x4e81a4ec  // smmla v12.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e81a474  // smmla v20.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45c  // smmla v28.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x4e80a4e9  // smmla v9.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e80a471  // smmla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a459  // smmla v25.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x40]\n"
+      ".inst 0x4e81a4ed  // smmla v13.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e81a475  // smmla v21.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45d  // smmla v29.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x4e80a4ea  // smmla v10.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e80a472  // smmla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45a  // smmla v26.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x60]\n"
+      ".inst 0x4e81a4ee  // smmla v14.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e81a476  // smmla v22.4s, v3.16b, v1.16b\n"
+      ".inst 0x4e81a45e  // smmla v30.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x70]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e80a4eb  // smmla v11.4s, v7.16b, v0.16b\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x4e87a453  // smmla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e87a49b  // smmla v27.4s, v4.16b, v7.16b\n"
-      ".inst 0x4e86a40f  // smmla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e86a457  // smmla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e86a49f  // smmla v31.4s, v4.16b, v6.16b\n"
+      ".inst 0x4e80a473  // smmla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x4e80a45b  // smmla v27.4s, v2.16b, v0.16b\n"
+      ".inst 0x4e86a4ef  // smmla v15.4s, v7.16b, v6.16b\n"
+      ".inst 0x4e86a477  // smmla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x4e86a45f  // smmla v31.4s, v2.16b, v6.16b\n"
       "212:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3440,7 +3439,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "224:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index ebc4342..14aba00 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -84,7 +84,7 @@
                 case CPUModel::A510:
                     return { 14.81 };
                 case CPUModel::V1:
-                    return { 48.36 };
+                    return { 44.54 };
             }
         }
 
@@ -108,5 +108,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
index b9caf54..00d063b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -78,329 +78,328 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 91f\n"
       "cmp %x[M], #0x2\n"
       "bgt 61f\n"
       "beq 31f\n"
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v15.16b, #0x1\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
-      "mov x14, %x[output_ptr]\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x13, %x[output_ptr]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "2:"  // Height 1: Column loop
       "movi v16.4s, #0x0\n"
       "movi v17.4s, #0x0\n"
       "movi v18.4s, #0x0\n"
       "movi v19.4s, #0x0\n"
       "3:"  // Height 1: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "cbnz x12, 6f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "cbnz x11, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
+      "add x9, x9, x20\n"
       "b 6f\n"
       "5:"  // Height 1: setup direct input
-      "mov x10, %x[input_ptr]\n"
+      "mov x9, %x[input_ptr]\n"
       "6:"  // Height 1: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 11f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 9f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr d21, [x12, #0x70]\n"
+      "ldr x20, [x12, #0x78]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d20, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d26, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
-      "mov v4.d[1], x9\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr d25, [x12, #0xa0]\n"
+      "mov v21.d[1], x20\n"
+      "ldr x20, [x12, #0x88]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
+      "ldr d24, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d23, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
-      "mov v5.d[1], x28\n"
-      "ldr x27, [x13, #0x98]\n"
-      "mov v6.d[1], x27\n"
-      "ldr x26, [x13, #0xa8]\n"
-      "mov v7.d[1], x26\n"
-      "ldr x25, [x13, #0xb8]\n"
-      "mov v8.d[1], x25\n"
-      "ldr x24, [x13, #0xc8]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "ldr x20, [x13, #0xd8]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      "ldr x9, [x13, #0xe8]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      "ldr x28, [x13, #0xf8]\n"
-      "mov v9.d[1], x24\n"
-      "mov v10.d[1], x20\n"
-      "add x10, x10, #0x10\n"
-      "mov v4.d[1], x9\n"
-      "add x13, x13, #0x100\n"
-      "mov v5.d[1], x28\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      "ldr d22, [x12, #0xd0]\n"
+      ".inst 0x6fa0e2b3  // udot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr d21, [x12, #0xe0]\n"
+      "mov v20.d[1], x20\n"
+      "ldr x20, [x12, #0x98]\n"
+      "mov v26.d[1], x20\n"
+      "ldr x20, [x12, #0xa8]\n"
+      "mov v25.d[1], x20\n"
+      "ldr x20, [x12, #0xb8]\n"
+      "mov v24.d[1], x20\n"
+      "ldr x23, [x12, #0xc8]\n"
+      ".inst 0x6f80ea90  // udot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr d20, [x12, #0xf0]\n"
+      ".inst 0x6f80eb51  // udot v17.4s, v26.16b, v0.4b[2]\n"
+      "ldr x22, [x12, #0xd8]\n"
+      ".inst 0x6f80eb32  // udot v18.4s, v25.16b, v0.4b[2]\n"
+      "ldr x21, [x12, #0xe8]\n"
+      ".inst 0x6f80eb13  // udot v19.4s, v24.16b, v0.4b[2]\n"
+      "ldr x20, [x12, #0xf8]\n"
+      "mov v23.d[1], x23\n"
+      "mov v22.d[1], x22\n"
+      "add x9, x9, #0x10\n"
+      "mov v21.d[1], x21\n"
+      "add x12, x12, #0x100\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6fa0eaf0  // udot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ead1  // udot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eab2  // udot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea93  // udot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 8f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "8:"  // Height 1: Multiply loop: unique 1: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q4, [x13, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q4, [x12, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "bge 7b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q21, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q20, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q26, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q25, [x12, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q24, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q23, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "sub x11, x11, #0x10\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      "add x10, x10, #0x10\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      "ldr q22, [x12, #0xd0]\n"
+      ".inst 0x6fa0e2b3  // udot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr q21, [x12, #0xe0]\n"
+      ".inst 0x6f80ea90  // udot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr q20, [x12, #0xf0]\n"
+      ".inst 0x6f80eb51  // udot v17.4s, v26.16b, v0.4b[2]\n"
+      "sub x10, x10, #0x10\n"
+      ".inst 0x6f80eb32  // udot v18.4s, v25.16b, v0.4b[2]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x6f80eb13  // udot v19.4s, v24.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x6fa0eaf0  // udot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ead1  // udot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eab2  // udot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea93  // udot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 10f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "10:"  // Height 1: Multiply loop: unique 2: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       "11:"  // Height 1: Multiply loop: Main loop skip
-      "cbz x11, 18f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 18f\n"
+      "cmp x10, #0x4\n"
       "blt 14f\n"
       "12:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
       "tbnz %x[flags], #31, 13f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "13:"  // Height 1: Multiply loop: unique 3: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q22, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q21, [x12, #0x20]\n"
+      ".inst 0x6f80e290  // udot v16.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x30]\n"
+      ".inst 0x6f80e2d1  // udot v17.4s, v22.16b, v0.4b[0]\n"
+      ".inst 0x6f80e2b2  // udot v18.4s, v21.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f80e293  // udot v19.4s, v20.16b, v0.4b[0]\n"
       "bge 12b\n"
       "14:"  // Height 1: Multiply loop: Skip odd blocks
-      "cbz x11, 18f\n"
-      "tbz x11, #1, 15f\n"
-      "ldr h0, [x10], #0x2\n"
-      "tbz x11, #0, 16f\n"
-      "ld1 { v0.b }[2], [x10]\n"
+      "cbz x10, 18f\n"
+      "tbz x10, #1, 15f\n"
+      "ldr h0, [x9], #0x2\n"
+      "tbz x10, #0, 16f\n"
+      "ld1 { v0.b }[2], [x9]\n"
       "b 16f\n"
       "15:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
       "16:"  // Height 1: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 17f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "17:"  // Height 1: Multiply loop: unique 4: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
+      "ldr q20, [x12, #0x0]\n"
+      ".inst 0x6f80e290  // udot v16.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x10]\n"
+      ".inst 0x6f80e291  // udot v17.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x20]\n"
+      ".inst 0x6f80e292  // udot v18.4s, v20.16b, v0.4b[0]\n"
+      "ldr q20, [x12, #0x30]\n"
+      ".inst 0x6f80e293  // udot v19.4s, v20.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
       "18:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 4b\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
       "tbnz %x[flags], #31, 19f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v1.4s }, [x23]\n"
-      "neg v1.4s, v1.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "neg v20.4s, v20.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "mul v11.4s, v11.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v20.4s\n"
       "19:"  // Height 1: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q23, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q22, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q21, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q20, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v23.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 20f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v0.16b\n"
+      "and v21.16b, v18.16b, v0.16b\n"
+      "and v20.16b, v19.16b, v0.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "20:"  // Height 1: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v20.4s\n"
+      "add v17.4s, v17.4s, v20.4s\n"
+      "add v18.4s, v18.4s, v20.4s\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v20.4s\n"
+      "smin v17.4s, v17.4s, v20.4s\n"
+      "smin v18.4s, v18.4s, v20.4s\n"
+      "smin v19.4s, v19.4s, v20.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "cmp x15, #0x10\n"
+      "cmp x14, #0x10\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
       "bge 29f\n"
-      "tbz x15, #3, 24f\n"
-      "str d16, [x14], #0x8\n"
-      "tbz x15, #2, 22f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "tbz x15, #1, 21f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[14], [x14]\n"
+      "tbz x14, #3, 24f\n"
+      "str d16, [x13], #0x8\n"
+      "tbz x14, #2, 22f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "tbz x14, #1, 21f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[14], [x13]\n"
       "b 28f\n"
       "21:"  // Height 1: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[12], [x14]\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[12], [x13]\n"
       "b 28f\n"
       "22:"  // Height 1: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 23f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[10], [x14]\n"
+      "tbz x14, #1, 23f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[10], [x13]\n"
       "b 28f\n"
       "23:"  // Height 1: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[8], [x14]\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[8], [x13]\n"
       "b 28f\n"
       "24:"  // Height 1: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 26f\n"
-      "str s16, [x14], #0x4\n"
-      "tbz x15, #1, 25f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[6], [x14]\n"
+      "tbz x14, #2, 26f\n"
+      "str s16, [x13], #0x4\n"
+      "tbz x14, #1, 25f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[6], [x13]\n"
       "b 28f\n"
       "25:"  // Height 1: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[4], [x14]\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[4], [x13]\n"
       "b 28f\n"
       "26:"  // Height 1: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 27f\n"
-      "str h16, [x14], #0x2\n"
-      "tbz x15, #0, 28f\n"
-      "st1 { v16.b }[2], [x14]\n"
+      "tbz x14, #1, 27f\n"
+      "str h16, [x13], #0x2\n"
+      "tbz x14, #0, 28f\n"
+      "st1 { v16.b }[2], [x13]\n"
       "b 28f\n"
       "27:"  // Height 1: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
       "28:"  // Height 1: Partial direct writeback: Done
       "b 30f\n"
       "29:"  // Height 1: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
       "30:"  // Height 1: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 2b\n"
       "b 122f\n"
       "31:"  // Height 2
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v12.4s, #0x0\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "movi v15.16b, #0x1\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x13, %x[output_ptr]\n"
       "32:"  // Height 2: Column loop
       "movi v16.4s, #0x0\n"
       "movi v17.4s, #0x0\n"
@@ -411,307 +410,307 @@
       "movi v22.4s, #0x0\n"
       "movi v23.4s, #0x0\n"
       "33:"  // Height 2: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "34:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 35f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "cbnz x12, 36f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x11, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
-      "add x23, x23, x20\n"
+      "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
       "b 36f\n"
       "35:"  // Height 2: setup direct input
-      "mov x10, %x[input_ptr]\n"
-      "add x23, x10, x20\n"
+      "mov x9, %x[input_ptr]\n"
+      "add x28, x9, x21\n"
       "36:"  // Height 2: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 41f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q1, [x23, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 39f\n"
       "37:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr x20, [x12, #0x78]\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
+      "ldr d25, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "mov v4.d[1], x9\n"
+      "mov v25.d[1], x20\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d24, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr x23, [x12, #0x88]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d30, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr x27, [x13, #0x98]\n"
+      "ldr x22, [x12, #0x98]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
-      "ldr x26, [x13, #0xa8]\n"
+      "ldr d29, [x12, #0xa0]\n"
+      "ldr x21, [x12, #0xa8]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
-      "ldr x25, [x13, #0xb8]\n"
+      "ldr d28, [x12, #0xb0]\n"
+      "ldr x20, [x12, #0xb8]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d27, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "mov v5.d[1], x28\n"
+      "mov v24.d[1], x23\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "mov v6.d[1], x27\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
-      "mov v7.d[1], x26\n"
-      "ldr x24, [x13, #0xc8]\n"
-      "mov v8.d[1], x25\n"
-      "ldr x20, [x13, #0xd8]\n"
-      "ldr x9, [x13, #0xe8]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
-      "ldr x28, [x13, #0xf8]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      "mov v9.d[1], x24\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      "mov v10.d[1], x20\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      "mov v4.d[1], x9\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      "mov v5.d[1], x28\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      "add x10, x10, #0x10\n"
-      "add x23, x23, #0x10\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      "ldr d26, [x12, #0xd0]\n"
+      ".inst 0x6fa0e333  // udot v19.4s, v25.16b, v0.4b[1]\n"
+      "mov v30.d[1], x22\n"
+      ".inst 0x6fa1e337  // udot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr d25, [x12, #0xe0]\n"
+      "mov v29.d[1], x21\n"
+      "ldr x23, [x12, #0xc8]\n"
+      "mov v28.d[1], x20\n"
+      "ldr x22, [x12, #0xd8]\n"
+      "ldr x21, [x12, #0xe8]\n"
+      ".inst 0x6f80eb10  // udot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb14  // udot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr d24, [x12, #0xf0]\n"
+      "ldr x20, [x12, #0xf8]\n"
+      ".inst 0x6f80ebd1  // udot v17.4s, v30.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebd5  // udot v21.4s, v30.16b, v1.4b[2]\n"
+      "mov v27.d[1], x23\n"
+      ".inst 0x6f80ebb2  // udot v18.4s, v29.16b, v0.4b[2]\n"
+      "mov v26.d[1], x22\n"
+      ".inst 0x6f81ebb6  // udot v22.4s, v29.16b, v1.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6f80eb93  // udot v19.4s, v28.16b, v0.4b[2]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6f81eb97  // udot v23.4s, v28.16b, v1.4b[2]\n"
+      "add x9, x9, #0x10\n"
+      "add x28, x28, #0x10\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x6fa0eb70  // udot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb74  // udot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb51  // udot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb55  // udot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb32  // udot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb36  // udot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb13  // udot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb17  // udot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 38f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       "38:"  // Height 2: Multiply loop: unique 5: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q1, [x23, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q1, [x28, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "bge 37b\n"
       "39:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "sub x11, x11, #0x10\n"
+      "sub x10, x10, #0x10\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q25, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q24, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q30, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q29, [x12, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q28, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q27, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      "ldr q26, [x12, #0xd0]\n"
+      ".inst 0x6fa0e333  // udot v19.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e337  // udot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr q25, [x12, #0xe0]\n"
+      ".inst 0x6f80eb10  // udot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb14  // udot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr q24, [x12, #0xf0]\n"
+      ".inst 0x6f80ebd1  // udot v17.4s, v30.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x6f81ebd5  // udot v21.4s, v30.16b, v1.4b[2]\n"
+      ".inst 0x6f80ebb2  // udot v18.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebb6  // udot v22.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f80eb93  // udot v19.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb97  // udot v23.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6fa0eb70  // udot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb74  // udot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb51  // udot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb55  // udot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb32  // udot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb36  // udot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb13  // udot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb17  // udot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 40f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       "40:"  // Height 2: Multiply loop: unique 6: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       "41:"  // Height 2: Multiply loop: Main loop skip
-      "cbz x11, 48f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 48f\n"
+      "cmp x10, #0x4\n"
       "blt 44f\n"
       "42:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
       "tbnz %x[flags], #31, 43f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       "43:"  // Height 2: Multiply loop: unique 7: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
+      "ldr q27, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q26, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q25, [x12, #0x20]\n"
+      ".inst 0x6f80e370  // udot v16.4s, v27.16b, v0.4b[0]\n"
+      "ldr q24, [x12, #0x30]\n"
+      ".inst 0x6f81e374  // udot v20.4s, v27.16b, v1.4b[0]\n"
+      ".inst 0x6f80e351  // udot v17.4s, v26.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f81e355  // udot v21.4s, v26.16b, v1.4b[0]\n"
+      ".inst 0x6f80e332  // udot v18.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e336  // udot v22.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f80e313  // udot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e317  // udot v23.4s, v24.16b, v1.4b[0]\n"
       "bge 42b\n"
       "44:"  // Height 2: Multiply loop: Skip odd blocks
-      "cbz x11, 48f\n"
-      "tbz x11, #1, 45f\n"
-      "ldr h0, [x10], #0x2\n"
-      "ldr h1, [x23], #0x2\n"
-      "tbz x11, #0, 46f\n"
-      "ld1 { v0.b }[2], [x10]\n"
-      "ld1 { v1.b }[2], [x23]\n"
+      "cbz x10, 48f\n"
+      "tbz x10, #1, 45f\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "tbz x10, #0, 46f\n"
+      "ld1 { v0.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x28]\n"
       "b 46f\n"
       "45:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
-      "ldr b1, [x23, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
       "46:"  // Height 2: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 47f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       "47:"  // Height 2: Multiply loop: unique 8: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x6f81e154  // udot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x6f81e095  // udot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d7  // udot v23.4s, v6.16b, v1.4b[0]\n"
+      "ldr q24, [x12, #0x0]\n"
+      ".inst 0x6f80e310  // udot v16.4s, v24.16b, v0.4b[0]\n"
+      "ldr q26, [x12, #0x10]\n"
+      ".inst 0x6f81e314  // udot v20.4s, v24.16b, v1.4b[0]\n"
+      "ldr q25, [x12, #0x20]\n"
+      ".inst 0x6f80e351  // udot v17.4s, v26.16b, v0.4b[0]\n"
+      "ldr q24, [x12, #0x30]\n"
+      ".inst 0x6f81e355  // udot v21.4s, v26.16b, v1.4b[0]\n"
+      ".inst 0x6f80e332  // udot v18.4s, v25.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f81e336  // udot v22.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f80e313  // udot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e317  // udot v23.4s, v24.16b, v1.4b[0]\n"
       "48:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 34b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x14, x20\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "add x23, x13, x20\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "tbnz %x[flags], #31, 49f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v2.4s }, [x23]\n"
-      "neg v2.4s, v2.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "neg v24.4s, v24.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "mul v11.4s, v11.4s, v2.4s\n"
-      "mul v12.4s, v12.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v24.4s\n"
+      "mul v12.4s, v12.4s, v24.4s\n"
       "49:"  // Height 2: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q27, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q26, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q25, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q24, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v27.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v25.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v25.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 50f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v24.16b, v16.16b, v0.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v24.4s\n"
+      "and v30.16b, v17.16b, v0.16b\n"
+      "and v29.16b, v18.16b, v0.16b\n"
+      "and v28.16b, v19.16b, v0.16b\n"
+      "and v27.16b, v20.16b, v0.16b\n"
+      "and v26.16b, v21.16b, v0.16b\n"
+      "and v25.16b, v22.16b, v0.16b\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v30.4s\n"
+      "sqadd v18.4s, v18.4s, v29.4s\n"
+      "sqadd v19.4s, v19.4s, v28.4s\n"
+      "sqadd v20.4s, v20.4s, v27.4s\n"
+      "sqadd v21.4s, v21.4s, v26.4s\n"
+      "sqadd v22.4s, v22.4s, v25.4s\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
       "50:"  // Height 2: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
@@ -721,122 +720,122 @@
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v24.4s\n"
+      "add v18.4s, v18.4s, v24.4s\n"
+      "add v19.4s, v19.4s, v24.4s\n"
+      "add v20.4s, v20.4s, v24.4s\n"
+      "add v21.4s, v21.4s, v24.4s\n"
+      "add v22.4s, v22.4s, v24.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v24.4s\n"
+      "smin v17.4s, v17.4s, v24.4s\n"
+      "smin v18.4s, v18.4s, v24.4s\n"
+      "smin v19.4s, v19.4s, v24.4s\n"
+      "smin v20.4s, v20.4s, v24.4s\n"
+      "smin v21.4s, v21.4s, v24.4s\n"
+      "smin v22.4s, v22.4s, v24.4s\n"
+      "smin v23.4s, v23.4s, v24.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
-      "cmp x15, #0x10\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v17.8h, v22.8h, v23.8h\n"
+      "cmp x14, #0x10\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v20.16b, v20.16b, v17.16b\n"
       "bge 59f\n"
-      "tbz x15, #3, 54f\n"
-      "str d16, [x14], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "tbz x15, #2, 52f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "tbz x15, #1, 51f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[14], [x14]\n"
-      "st1 { v20.b }[14], [x22]\n"
+      "tbz x14, #3, 54f\n"
+      "str d16, [x13], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "tbz x14, #2, 52f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "tbz x14, #1, 51f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[14], [x13]\n"
+      "st1 { v20.b }[14], [x23]\n"
       "b 58f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[12], [x14]\n"
-      "st1 { v20.b }[12], [x22]\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[12], [x13]\n"
+      "st1 { v20.b }[12], [x23]\n"
       "b 58f\n"
       "52:"  // Height 2: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 53f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[10], [x14]\n"
-      "st1 { v20.b }[10], [x22]\n"
+      "tbz x14, #1, 53f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[10], [x13]\n"
+      "st1 { v20.b }[10], [x23]\n"
       "b 58f\n"
       "53:"  // Height 2: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[8], [x14]\n"
-      "st1 { v20.b }[8], [x22]\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[8], [x13]\n"
+      "st1 { v20.b }[8], [x23]\n"
       "b 58f\n"
       "54:"  // Height 2: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 56f\n"
-      "str s16, [x14], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "tbz x15, #1, 55f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[6], [x14]\n"
-      "st1 { v20.b }[6], [x22]\n"
+      "tbz x14, #2, 56f\n"
+      "str s16, [x13], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "tbz x14, #1, 55f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[6], [x13]\n"
+      "st1 { v20.b }[6], [x23]\n"
       "b 58f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[4], [x14]\n"
-      "st1 { v20.b }[4], [x22]\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[4], [x13]\n"
+      "st1 { v20.b }[4], [x23]\n"
       "b 58f\n"
       "56:"  // Height 2: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 57f\n"
-      "str h16, [x14], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "tbz x15, #0, 58f\n"
-      "st1 { v16.b }[2], [x14]\n"
-      "st1 { v20.b }[2], [x22]\n"
+      "tbz x14, #1, 57f\n"
+      "str h16, [x13], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "tbz x14, #0, 58f\n"
+      "st1 { v16.b }[2], [x13]\n"
+      "st1 { v20.b }[2], [x23]\n"
       "b 58f\n"
       "57:"  // Height 2: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
       "58:"  // Height 2: Partial direct writeback: Done
       "b 60f\n"
       "59:"  // Height 2: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q20, [x22, #0x0]\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "str q20, [x23, #0x0]\n"
       "60:"  // Height 2: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 32b\n"
       "b 122f\n"
       "61:"  // Height 3
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v12.4s, #0x0\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "movi v13.4s, #0x0\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
       "movi v15.16b, #0x1\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-      "mov x14, %x[output_ptr]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x13, %x[output_ptr]\n"
       "62:"  // Height 3: Column loop
       "movi v16.4s, #0x0\n"
       "movi v17.4s, #0x0\n"
@@ -851,317 +850,317 @@
       "movi v26.4s, #0x0\n"
       "movi v27.4s, #0x0\n"
       "63:"  // Height 3: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "64:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 65f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "cbnz x12, 66f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x27, [x20, #0x10]\n"
+      "cbnz x11, 66f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
-      "add x23, x23, x20\n"
-      "add x22, x22, x20\n"
+      "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
+      "add x27, x27, x20\n"
       "b 66f\n"
       "65:"  // Height 3: setup direct input
-      "mov x10, %x[input_ptr]\n"
-      "add x23, x10, x20\n"
-      "add x22, x23, x20\n"
+      "mov x9, %x[input_ptr]\n"
+      "add x28, x9, x21\n"
+      "add x27, x28, x21\n"
       "66:"  // Height 3: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 71f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q1, [x23, #0x0]\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 69f\n"
       "67:"  // Height 3: Multiply loop: Main loop head
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr x20, [x12, #0x78]\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr x23, [x12, #0x88]\n"
       ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
+      "ldr d29, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "mov v4.d[1], x9\n"
+      "mov v29.d[1], x20\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr x27, [x13, #0x98]\n"
+      "ldr x22, [x12, #0x98]\n"
       ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d28, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr x26, [x13, #0xa8]\n"
+      "ldr x21, [x12, #0xa8]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr x25, [x13, #0xb8]\n"
+      "ldr x20, [x12, #0xb8]\n"
       ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d5, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "mov v5.d[1], x28\n"
+      "mov v28.d[1], x23\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
-      "mov v6.d[1], x27\n"
+      "mov v5.d[1], x22\n"
       ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
+      "ldr d4, [x12, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
-      "mov v7.d[1], x26\n"
+      "mov v4.d[1], x21\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr x24, [x13, #0xc8]\n"
+      "ldr x23, [x12, #0xc8]\n"
       ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
+      "ldr d3, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
-      "mov v8.d[1], x25\n"
+      "mov v3.d[1], x20\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr x20, [x13, #0xd8]\n"
+      "ldr x22, [x12, #0xd8]\n"
       ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d31, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr x9, [x13, #0xe8]\n"
+      "ldr x21, [x12, #0xe8]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr x28, [x13, #0xf8]\n"
+      "ldr x20, [x12, #0xf8]\n"
       ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "mov v9.d[1], x24\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      "mov v10.d[1], x20\n"
-      ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      "mov v4.d[1], x9\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      "add x10, x10, #0x10\n"
-      ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "mov v5.d[1], x28\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      "add x23, x23, #0x10\n"
-      ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
-      "add x22, x22, #0x10\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8fa  // udot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6f82e91b  // udot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e938  // udot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e959  // udot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e89a  // udot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8bb  // udot v27.4s, v5.16b, v2.4b[3]\n"
+      "ldr d30, [x12, #0xd0]\n"
+      ".inst 0x6fa0e3b3  // udot v19.4s, v29.16b, v0.4b[1]\n"
+      "mov v31.d[1], x23\n"
+      ".inst 0x6fa1e3b7  // udot v23.4s, v29.16b, v1.4b[1]\n"
+      "mov v30.d[1], x22\n"
+      ".inst 0x6fa2e3bb  // udot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr d29, [x12, #0xe0]\n"
+      ".inst 0x6f80eb90  // udot v16.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6f81eb94  // udot v20.4s, v28.16b, v1.4b[2]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x6f82eb98  // udot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr d28, [x12, #0xf0]\n"
+      ".inst 0x6f80e8b1  // udot v17.4s, v5.16b, v0.4b[2]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6f81e8b5  // udot v21.4s, v5.16b, v1.4b[2]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f82e8b9  // udot v25.4s, v5.16b, v2.4b[2]\n"
+      "add x27, x27, #0x10\n"
+      ".inst 0x6f80e892  // udot v18.4s, v4.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x6f81e896  // udot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x6f82e89a  // udot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x6f80e873  // udot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x6f81e877  // udot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x6f82e87b  // udot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x6fa0ebf0  // udot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebf4  // udot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebf8  // udot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebd1  // udot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebd5  // udot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebd9  // udot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebb2  // udot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebb6  // udot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebba  // udot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eb93  // udot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb97  // udot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb9b  // udot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 68f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "68:"  // Height 3: Multiply loop: unique 9: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q1, [x23, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q1, [x28, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       "bge 67b\n"
       "69:"  // Height 3: Multiply loop: Single iteration only
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "sub x11, x11, #0x10\n"
+      "sub x10, x10, #0x10\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q29, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "add x22, x22, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q28, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q5, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q4, [x12, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q3, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q31, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8fa  // udot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6f82e91b  // udot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e938  // udot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e959  // udot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e89a  // udot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8bb  // udot v27.4s, v5.16b, v2.4b[3]\n"
+      "ldr q30, [x12, #0xd0]\n"
+      ".inst 0x6fa0e3b3  // udot v19.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3b7  // udot v23.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3bb  // udot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr q29, [x12, #0xe0]\n"
+      ".inst 0x6f80eb90  // udot v16.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb94  // udot v20.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb98  // udot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr q28, [x12, #0xf0]\n"
+      ".inst 0x6f80e8b1  // udot v17.4s, v5.16b, v0.4b[2]\n"
+      "add x12, x12, #0x100\n"
+      ".inst 0x6f81e8b5  // udot v21.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8b9  // udot v25.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x6f80e892  // udot v18.4s, v4.16b, v0.4b[2]\n"
+      ".inst 0x6f81e896  // udot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x6f82e89a  // udot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x6f80e873  // udot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x6f81e877  // udot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x6f82e87b  // udot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x6fa0ebf0  // udot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebf4  // udot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebf8  // udot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebd1  // udot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebd5  // udot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebd9  // udot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebb2  // udot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebb6  // udot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebba  // udot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eb93  // udot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb97  // udot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb9b  // udot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 70f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "70:"  // Height 3: Multiply loop: unique 10: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
       "71:"  // Height 3: Multiply loop: Main loop skip
-      "cbz x11, 78f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 78f\n"
+      "cmp x10, #0x4\n"
       "blt 74f\n"
       "72:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
-      "ldr s2, [x22], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x27], #0x4\n"
       "tbnz %x[flags], #31, 73f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "73:"  // Height 3: Multiply loop: unique 11: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d8  // udot v24.4s, v6.16b, v2.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f9  // udot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x6f82e11a  // udot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x6f82e13b  // udot v27.4s, v9.16b, v2.4b[0]\n"
+      "ldr q31, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q30, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q29, [x12, #0x20]\n"
+      ".inst 0x6f80e3f0  // udot v16.4s, v31.16b, v0.4b[0]\n"
+      "ldr q28, [x12, #0x30]\n"
+      ".inst 0x6f81e3f4  // udot v20.4s, v31.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3f8  // udot v24.4s, v31.16b, v2.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f80e3d1  // udot v17.4s, v30.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3d5  // udot v21.4s, v30.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3d9  // udot v25.4s, v30.16b, v2.4b[0]\n"
+      ".inst 0x6f80e3b2  // udot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3b6  // udot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3ba  // udot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f80e393  // udot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e397  // udot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e39b  // udot v27.4s, v28.16b, v2.4b[0]\n"
       "bge 72b\n"
       "74:"  // Height 3: Multiply loop: Skip odd blocks
-      "cbz x11, 78f\n"
-      "tbz x11, #1, 75f\n"
-      "ldr h0, [x10], #0x2\n"
-      "ldr h1, [x23], #0x2\n"
-      "ldr h2, [x22], #0x2\n"
-      "tbz x11, #0, 76f\n"
-      "ld1 { v0.b }[2], [x10]\n"
-      "ld1 { v1.b }[2], [x23]\n"
-      "ld1 { v2.b }[2], [x22]\n"
+      "cbz x10, 78f\n"
+      "tbz x10, #1, 75f\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x27], #0x2\n"
+      "tbz x10, #0, 76f\n"
+      "ld1 { v0.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x27]\n"
       "b 76f\n"
       "75:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
-      "ldr b1, [x23, #0x0]\n"
-      "ldr b2, [x22, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x27, #0x0]\n"
       "76:"  // Height 3: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 77f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "77:"  // Height 3: Multiply loop: unique 12: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x6f81e154  // udot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x6f82e158  // udot v24.4s, v10.16b, v2.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x6f81e095  // udot v21.4s, v4.16b, v1.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f82e099  // udot v25.4s, v4.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0ba  // udot v26.4s, v5.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d7  // udot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0db  // udot v27.4s, v6.16b, v2.4b[0]\n"
+      "ldr q28, [x12, #0x0]\n"
+      ".inst 0x6f80e390  // udot v16.4s, v28.16b, v0.4b[0]\n"
+      "ldr q30, [x12, #0x10]\n"
+      ".inst 0x6f81e394  // udot v20.4s, v28.16b, v1.4b[0]\n"
+      "ldr q29, [x12, #0x20]\n"
+      ".inst 0x6f82e398  // udot v24.4s, v28.16b, v2.4b[0]\n"
+      "ldr q28, [x12, #0x30]\n"
+      ".inst 0x6f80e3d1  // udot v17.4s, v30.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3d5  // udot v21.4s, v30.16b, v1.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f82e3d9  // udot v25.4s, v30.16b, v2.4b[0]\n"
+      ".inst 0x6f80e3b2  // udot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3b6  // udot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3ba  // udot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f80e393  // udot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e397  // udot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e39b  // udot v27.4s, v28.16b, v2.4b[0]\n"
       "78:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 64b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x14, x20\n"
-      "add x21, x22, x20\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "add x23, x13, x20\n"
+      "add x22, x23, x20\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "tbnz %x[flags], #31, 79f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v3.4s }, [x23]\n"
-      "neg v3.4s, v3.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "neg v28.4s, v28.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "mul v11.4s, v11.4s, v3.4s\n"
-      "mul v12.4s, v12.4s, v3.4s\n"
-      "mul v13.4s, v13.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v28.4s\n"
+      "mul v12.4s, v12.4s, v28.4s\n"
+      "mul v13.4s, v13.4s, v28.4s\n"
       "79:"  // Height 3: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q31, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q30, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q29, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q28, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
@@ -1171,73 +1170,73 @@
       "add v25.4s, v25.4s, v13.4s\n"
       "add v26.4s, v26.4s, v13.4s\n"
       "add v27.4s, v27.4s, v13.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v31.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v31.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v31.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 80f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "and v1.16b, v16.16b, v0.16b\n"
+      "and v31.16b, v17.16b, v0.16b\n"
+      "and v30.16b, v18.16b, v0.16b\n"
+      "and v29.16b, v19.16b, v0.16b\n"
+      "and v28.16b, v20.16b, v0.16b\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v1.4s\n"
+      "sqadd v17.4s, v17.4s, v31.4s\n"
+      "sqadd v18.4s, v18.4s, v30.4s\n"
+      "sqadd v19.4s, v19.4s, v29.4s\n"
+      "sqadd v20.4s, v20.4s, v28.4s\n"
+      "and v3.16b, v21.16b, v0.16b\n"
+      "and v2.16b, v22.16b, v0.16b\n"
+      "and v1.16b, v23.16b, v0.16b\n"
+      "and v31.16b, v24.16b, v0.16b\n"
+      "and v30.16b, v25.16b, v0.16b\n"
+      "and v29.16b, v26.16b, v0.16b\n"
+      "and v28.16b, v27.16b, v0.16b\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v3.4s\n"
+      "sqadd v22.4s, v22.4s, v2.4s\n"
+      "sqadd v23.4s, v23.4s, v1.4s\n"
+      "sqadd v24.4s, v24.4s, v31.4s\n"
+      "sqadd v25.4s, v25.4s, v30.4s\n"
+      "sqadd v26.4s, v26.4s, v29.4s\n"
+      "sqadd v27.4s, v27.4s, v28.4s\n"
       "80:"  // Height 3: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
@@ -1251,156 +1250,156 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v28.4s\n"
+      "add v18.4s, v18.4s, v28.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v28.4s\n"
+      "add v21.4s, v21.4s, v28.4s\n"
+      "add v22.4s, v22.4s, v28.4s\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v24.4s, v24.4s, v28.4s\n"
+      "add v25.4s, v25.4s, v28.4s\n"
+      "add v26.4s, v26.4s, v28.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v28.4s\n"
+      "smin v17.4s, v17.4s, v28.4s\n"
+      "smin v18.4s, v18.4s, v28.4s\n"
+      "smin v19.4s, v19.4s, v28.4s\n"
+      "smin v20.4s, v20.4s, v28.4s\n"
+      "smin v21.4s, v21.4s, v28.4s\n"
+      "smin v22.4s, v22.4s, v28.4s\n"
+      "smin v23.4s, v23.4s, v28.4s\n"
+      "smin v24.4s, v24.4s, v28.4s\n"
+      "smin v25.4s, v25.4s, v28.4s\n"
+      "smin v26.4s, v26.4s, v28.4s\n"
+      "smin v27.4s, v27.4s, v28.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v28.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v28.4s\n"
+      "smax v17.4s, v17.4s, v28.4s\n"
+      "smax v18.4s, v18.4s, v28.4s\n"
+      "smax v19.4s, v19.4s, v28.4s\n"
+      "smax v20.4s, v20.4s, v28.4s\n"
+      "smax v21.4s, v21.4s, v28.4s\n"
+      "smax v22.4s, v22.4s, v28.4s\n"
+      "smax v23.4s, v23.4s, v28.4s\n"
+      "smax v24.4s, v24.4s, v28.4s\n"
+      "smax v25.4s, v25.4s, v28.4s\n"
+      "smax v26.4s, v26.4s, v28.4s\n"
+      "smax v27.4s, v27.4s, v28.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v18.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "cmp x15, #0x10\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "cmp x14, #0x10\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v20.16b, v20.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 89f\n"
-      "tbz x15, #3, 84f\n"
-      "str d16, [x14], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "tbz x15, #2, 82f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "tbz x15, #1, 81f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[14], [x14]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "tbz x14, #3, 84f\n"
+      "str d16, [x13], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "tbz x14, #2, 82f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "tbz x14, #1, 81f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[14], [x13]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 88f\n"
       "81:"  // Height 3: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[12], [x14]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[12], [x13]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 88f\n"
       "82:"  // Height 3: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 83f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[10], [x14]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "tbz x14, #1, 83f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[10], [x13]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 88f\n"
       "83:"  // Height 3: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[8], [x14]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[8], [x13]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 88f\n"
       "84:"  // Height 3: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 86f\n"
-      "str s16, [x14], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "tbz x15, #1, 85f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[6], [x14]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "tbz x14, #2, 86f\n"
+      "str s16, [x13], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "tbz x14, #1, 85f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[6], [x13]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 88f\n"
       "85:"  // Height 3: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[4], [x14]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[4], [x13]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 88f\n"
       "86:"  // Height 3: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 87f\n"
-      "str h16, [x14], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "tbz x15, #0, 88f\n"
-      "st1 { v16.b }[2], [x14]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "tbz x14, #1, 87f\n"
+      "str h16, [x13], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "tbz x14, #0, 88f\n"
+      "st1 { v16.b }[2], [x13]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 88f\n"
       "87:"  // Height 3: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "88:"  // Height 3: Partial direct writeback: Done
       "b 90f\n"
       "89:"  // Height 3: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "90:"  // Height 3: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 62b\n"
       "b 122f\n"
       "91:"  // Height 4
       "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "mov x20, #0x4\n"
-      "mov x16, %x[col_bias]\n"
+      "mov x15, %x[col_bias]\n"
       "movi v11.4s, #0x0\n"
       "movi v12.4s, #0x0\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "movi v13.4s, #0x0\n"
-      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
       "movi v14.4s, #0x0\n"
-      "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
       "movi v15.16b, #0x1\n"
-      "mov x14, %x[output_ptr]\n"
+      "mov x13, %x[output_ptr]\n"
       "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
       "92:"  // Height 4: Column loop
       "movi v16.4s, #0x0\n"
@@ -1420,117 +1419,117 @@
       "movi v30.4s, #0x0\n"
       "movi v31.4s, #0x0\n"
       "93:"  // Height 4: setup done
-      "mov x12, #0x0\n"
+      "mov x11, #0x0\n"
       "94:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-      "ldr w11, [x20, x12, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w10, [x20, x11, LSL #0x2]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 95f\n"
-      "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x10, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
-      "cbnz x12, 96f\n"
+      "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x9, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x27, [x20, #0x10]\n"
+      "ldr x26, [x20, #0x18]\n"
+      "cbnz x11, 96f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-      "add x10, x10, x20\n"
-      "add x23, x23, x20\n"
-      "add x22, x22, x20\n"
-      "add x21, x21, x20\n"
+      "add x9, x9, x20\n"
+      "add x28, x28, x20\n"
+      "add x27, x27, x20\n"
+      "add x26, x26, x20\n"
       "b 96f\n"
       "95:"  // Height 4: setup direct input
-      "mov x10, %x[input_ptr]\n"
-      "add x23, x10, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "mov x9, %x[input_ptr]\n"
+      "add x28, x9, x21\n"
+      "add x27, x28, x21\n"
+      "add x26, x27, x21\n"
       "96:"  // Height 4: input setup done
-      "cmp x11, #0x10\n"
+      "cmp x10, #0x10\n"
       "blt 101f\n"
-      "ldr q0, [x10, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q1, [x23, #0x0]\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q3, [x21, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q3, [x26, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
       "blt 99f\n"
       "97:"  // Height 4: Multiply loop: Main loop head
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr x9, [x13, #0x78]\n"
+      "ldr x22, [x12, #0x78]\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr x28, [x13, #0x88]\n"
+      "ldr x21, [x12, #0x88]\n"
       ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr x27, [x13, #0x98]\n"
+      "ldr x20, [x12, #0x98]\n"
       ".inst 0x6f83e09c  // udot v28.4s, v4.16b, v3.4b[0]\n"
-      "ldr d4, [x13, #0x70]\n"
+      "ldr d4, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "mov v4.d[1], x9\n"
+      "mov v4.d[1], x22\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr x26, [x13, #0xa8]\n"
+      "ldr x25, [x12, #0xa8]\n"
       ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr x25, [x13, #0xb8]\n"
+      "ldr x24, [x12, #0xb8]\n"
       ".inst 0x6f83e0bd  // udot v29.4s, v5.16b, v3.4b[0]\n"
-      "ldr d5, [x13, #0x80]\n"
+      "ldr d5, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "mov v5.d[1], x28\n"
+      "mov v5.d[1], x21\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr x24, [x13, #0xc8]\n"
+      "ldr x23, [x12, #0xc8]\n"
       ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr x20, [x13, #0xd8]\n"
+      "ldr x22, [x12, #0xd8]\n"
       ".inst 0x6f83e0de  // udot v30.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x13, #0x90]\n"
+      "ldr d6, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x27\n"
+      "mov v6.d[1], x20\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr x9, [x13, #0xe8]\n"
+      "ldr x21, [x12, #0xe8]\n"
       ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr x28, [x13, #0xf8]\n"
+      "ldr x20, [x12, #0xf8]\n"
       ".inst 0x6f83e0ff  // udot v31.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x13, #0xa0]\n"
+      "ldr d7, [x12, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
-      "mov v7.d[1], x26\n"
+      "mov v7.d[1], x25\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x6fa3e11c  // udot v28.4s, v8.16b, v3.4b[1]\n"
-      "ldr d8, [x13, #0xb0]\n"
+      "ldr d8, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
-      "mov v8.d[1], x25\n"
+      "mov v8.d[1], x24\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
-      "add x22, x22, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
-      "add x21, x21, #0x10\n"
+      "add x26, x26, #0x10\n"
       ".inst 0x6fa3e13d  // udot v29.4s, v9.16b, v3.4b[1]\n"
-      "ldr d9, [x13, #0xc0]\n"
+      "ldr d9, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "mov v9.d[1], x24\n"
+      "mov v9.d[1], x23\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
       ".inst 0x6fa3e15e  // udot v30.4s, v10.16b, v3.4b[1]\n"
-      "ldr d10, [x13, #0xd0]\n"
+      "ldr d10, [x12, #0xd0]\n"
       ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "mov v10.d[1], x20\n"
+      "mov v10.d[1], x22\n"
       ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
       ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
       ".inst 0x6fa3e09f  // udot v31.4s, v4.16b, v3.4b[1]\n"
-      "ldr d4, [x13, #0xe0]\n"
+      "ldr d4, [x12, #0xe0]\n"
       ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      "mov v4.d[1], x9\n"
+      "mov v4.d[1], x21\n"
       ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
       ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
       ".inst 0x6f83e8bc  // udot v28.4s, v5.16b, v3.4b[2]\n"
-      "ldr d5, [x13, #0xf0]\n"
+      "ldr d5, [x12, #0xf0]\n"
       ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "mov v5.d[1], x28\n"
+      "mov v5.d[1], x20\n"
       ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      "add x13, x13, #0x100\n"
+      "add x12, x12, #0x100\n"
       ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x6f83e8dd  // udot v29.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
@@ -1563,77 +1562,77 @@
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
       "98:"  // Height 4: Multiply loop: unique 13: skip row sum
-      "ldr q0, [x10, #0x0]\n"
-      "sub x11, x11, #0x10\n"
-      "ldr q1, [x23, #0x0]\n"
-      "cmp x11, #0x20\n"
-      "ldr q2, [x22, #0x0]\n"
-      "ldr q3, [x21, #0x0]\n"
-      "ldr q4, [x13, #0x0]\n"
-      "ldr q5, [x13, #0x10]\n"
-      "ldr q6, [x13, #0x20]\n"
-      "ldr q7, [x13, #0x30]\n"
-      "ldr q8, [x13, #0x40]\n"
-      "ldr q9, [x13, #0x50]\n"
-      "ldr q10, [x13, #0x60]\n"
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "ldr q0, [x9, #0x0]\n"
+      "sub x10, x10, #0x10\n"
+      "ldr q1, [x28, #0x0]\n"
+      "cmp x10, #0x20\n"
+      "ldr q2, [x27, #0x0]\n"
+      "ldr q3, [x26, #0x0]\n"
+      "ldr q4, [x12, #0x0]\n"
+      "ldr q5, [x12, #0x10]\n"
+      "ldr q6, [x12, #0x20]\n"
+      "ldr q7, [x12, #0x30]\n"
+      "ldr q8, [x12, #0x40]\n"
+      "ldr q9, [x12, #0x50]\n"
+      "ldr q10, [x12, #0x60]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
       "bge 97b\n"
       "99:"  // Height 4: Multiply loop: Single iteration only
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "sub x11, x11, #0x10\n"
+      "sub x10, x10, #0x10\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "add x10, x10, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x6f83e09c  // udot v28.4s, v4.16b, v3.4b[0]\n"
-      "ldr q4, [x13, #0x70]\n"
+      "ldr q4, [x12, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "add x22, x22, #0x10\n"
+      "add x27, x27, #0x10\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x26, x26, #0x10\n"
       ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
       ".inst 0x6f83e0bd  // udot v29.4s, v5.16b, v3.4b[0]\n"
-      "ldr q5, [x13, #0x80]\n"
+      "ldr q5, [x12, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x6f83e0de  // udot v30.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x13, #0x90]\n"
+      "ldr q6, [x12, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x6f83e0ff  // udot v31.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x13, #0xa0]\n"
+      "ldr q7, [x12, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
       ".inst 0x6fa3e11c  // udot v28.4s, v8.16b, v3.4b[1]\n"
-      "ldr q8, [x13, #0xb0]\n"
+      "ldr q8, [x12, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
       ".inst 0x6fa3e13d  // udot v29.4s, v9.16b, v3.4b[1]\n"
-      "ldr q9, [x13, #0xc0]\n"
+      "ldr q9, [x12, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
       ".inst 0x6fa3e15e  // udot v30.4s, v10.16b, v3.4b[1]\n"
-      "ldr q10, [x13, #0xd0]\n"
+      "ldr q10, [x12, #0xd0]\n"
       ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
       ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
       ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
       ".inst 0x6fa3e09f  // udot v31.4s, v4.16b, v3.4b[1]\n"
-      "ldr q4, [x13, #0xe0]\n"
+      "ldr q4, [x12, #0xe0]\n"
       ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
       ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
       ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
       ".inst 0x6f83e8bc  // udot v28.4s, v5.16b, v3.4b[2]\n"
-      "ldr q5, [x13, #0xf0]\n"
+      "ldr q5, [x12, #0xf0]\n"
       ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      "add x13, x13, #0x100\n"
+      "add x12, x12, #0x100\n"
       ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
       ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x6f83e8dd  // udot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1667,67 +1666,67 @@
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
       "100:"  // Height 4: Multiply loop: unique 14: skip row sum
-      "prfm pldl1keep, [x10, #0x80]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      "prfm pldl1keep, [x22, #0x80]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
       "101:"  // Height 4: Multiply loop: Main loop skip
-      "cbz x11, 108f\n"
-      "cmp x11, #0x4\n"
+      "cbz x10, 108f\n"
+      "cmp x10, #0x4\n"
       "blt 104f\n"
       "102:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x10], #0x4\n"
-      "ldr s1, [x23], #0x4\n"
-      "ldr s2, [x22], #0x4\n"
-      "ldr s3, [x21], #0x4\n"
+      "ldr s0, [x9], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x27], #0x4\n"
+      "ldr s3, [x26], #0x4\n"
       "tbnz %x[flags], #31, 103f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
       "103:"  // Height 4: Multiply loop: unique 15: skip row sum
-      "ldr q6, [x13, #0x0]\n"
-      "sub x11, x11, #0x4\n"
-      "ldr q7, [x13, #0x10]\n"
-      "cmp x11, #0x4\n"
-      "ldr q8, [x13, #0x20]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      "ldr q9, [x13, #0x30]\n"
-      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d8  // udot v24.4s, v6.16b, v2.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f83e0dc  // udot v28.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f9  // udot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0fd  // udot v29.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x6f82e11a  // udot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x6f83e11e  // udot v30.4s, v8.16b, v3.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x6f82e13b  // udot v27.4s, v9.16b, v2.4b[0]\n"
-      ".inst 0x6f83e13f  // udot v31.4s, v9.16b, v3.4b[0]\n"
+      "ldr q7, [x12, #0x0]\n"
+      "sub x10, x10, #0x4\n"
+      "ldr q6, [x12, #0x10]\n"
+      "cmp x10, #0x4\n"
+      "ldr q5, [x12, #0x20]\n"
+      ".inst 0x6f80e0f0  // udot v16.4s, v7.16b, v0.4b[0]\n"
+      "ldr q4, [x12, #0x30]\n"
+      ".inst 0x6f81e0f4  // udot v20.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f8  // udot v24.4s, v7.16b, v2.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f83e0fc  // udot v28.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d9  // udot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0dd  // udot v29.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0ba  // udot v26.4s, v5.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0be  // udot v30.4s, v5.16b, v3.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x6f82e09b  // udot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x6f83e09f  // udot v31.4s, v4.16b, v3.4b[0]\n"
       "bge 102b\n"
       "104:"  // Height 4: Multiply loop: Skip odd blocks
-      "cbz x11, 108f\n"
-      "tbz x11, #1, 105f\n"
-      "ldr h0, [x10], #0x2\n"
-      "ldr h1, [x23], #0x2\n"
-      "ldr h2, [x22], #0x2\n"
-      "ldr h3, [x21], #0x2\n"
-      "tbz x11, #0, 106f\n"
-      "ld1 { v0.b }[2], [x10]\n"
-      "ld1 { v1.b }[2], [x23]\n"
-      "ld1 { v2.b }[2], [x22]\n"
-      "ld1 { v3.b }[2], [x21]\n"
+      "cbz x10, 108f\n"
+      "tbz x10, #1, 105f\n"
+      "ldr h0, [x9], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x27], #0x2\n"
+      "ldr h3, [x26], #0x2\n"
+      "tbz x10, #0, 106f\n"
+      "ld1 { v0.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x27]\n"
+      "ld1 { v3.b }[2], [x26]\n"
       "b 106f\n"
       "105:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
-      "ldr b0, [x10, #0x0]\n"
-      "ldr b1, [x23, #0x0]\n"
-      "ldr b2, [x22, #0x0]\n"
-      "ldr b3, [x21, #0x0]\n"
+      "ldr b0, [x9, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x27, #0x0]\n"
+      "ldr b3, [x26, #0x0]\n"
       "106:"  // Height 4: Multiply loop: Ragged operand read: Done
       "tbnz %x[flags], #31, 107f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
@@ -1735,64 +1734,64 @@
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
       "107:"  // Height 4: Multiply loop: unique 16: skip row sum
-      "ldr q10, [x13, #0x0]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      "ldr q4, [x13, #0x10]\n"
-      ".inst 0x6f81e154  // udot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x13, #0x20]\n"
-      ".inst 0x6f82e158  // udot v24.4s, v10.16b, v2.4b[0]\n"
-      "ldr q6, [x13, #0x30]\n"
-      ".inst 0x6f83e15c  // udot v28.4s, v10.16b, v3.4b[0]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      "add x13, x13, #0x40\n"
-      ".inst 0x6f81e095  // udot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x6f82e099  // udot v25.4s, v4.16b, v2.4b[0]\n"
-      ".inst 0x6f83e09d  // udot v29.4s, v4.16b, v3.4b[0]\n"
+      "ldr q7, [x12, #0x0]\n"
+      ".inst 0x6f80e0f0  // udot v16.4s, v7.16b, v0.4b[0]\n"
+      "ldr q6, [x12, #0x10]\n"
+      ".inst 0x6f81e0f4  // udot v20.4s, v7.16b, v1.4b[0]\n"
+      "ldr q5, [x12, #0x20]\n"
+      ".inst 0x6f82e0f8  // udot v24.4s, v7.16b, v2.4b[0]\n"
+      "ldr q4, [x12, #0x30]\n"
+      ".inst 0x6f83e0fc  // udot v28.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      "add x12, x12, #0x40\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d9  // udot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0dd  // udot v29.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x6f82e0ba  // udot v26.4s, v5.16b, v2.4b[0]\n"
       ".inst 0x6f83e0be  // udot v30.4s, v5.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d7  // udot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0db  // udot v27.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0df  // udot v31.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x6f82e09b  // udot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x6f83e09f  // udot v31.4s, v4.16b, v3.4b[0]\n"
       "108:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "add x12, x12, #0x1\n"
-      "cmp x12, x20\n"
+      "add x11, x11, #0x1\n"
+      "cmp x11, x20\n"
       "bne 94b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x14, x20\n"
+      "add x23, x13, x20\n"
+      "add x22, x23, x20\n"
       "add x21, x22, x20\n"
-      "add x20, x21, x20\n"
-      "prfm pstl1keep, [x14, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
       "prfm pstl1keep, [x21, #0x0]\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
       "tbnz %x[flags], #31, 109f\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "neg v4.4s, v4.4s\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "neg v0.4s, v0.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "mul v11.4s, v11.4s, v4.4s\n"
-      "mul v12.4s, v12.4s, v4.4s\n"
-      "mul v13.4s, v13.4s, v4.4s\n"
-      "mul v14.4s, v14.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v0.4s\n"
+      "mul v12.4s, v12.4s, v0.4s\n"
+      "mul v13.4s, v13.4s, v0.4s\n"
+      "mul v14.4s, v14.4s, v0.4s\n"
       "109:"  // Height 4: skip row sum fixup
-      "ldr q0, [x16, #0x0]\n"
+      "ldr q3, [x15, #0x0]\n"
       "add v16.4s, v16.4s, v11.4s\n"
-      "ldr q1, [x16, #0x10]\n"
+      "ldr q2, [x15, #0x10]\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x16, #0x20]\n"
+      "ldr q1, [x15, #0x20]\n"
       "add v18.4s, v18.4s, v11.4s\n"
-      "ldr q3, [x16, #0x30]\n"
+      "ldr q0, [x15, #0x30]\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
@@ -1806,93 +1805,93 @@
       "add v29.4s, v29.4s, v14.4s\n"
       "add v30.4s, v30.4s, v14.4s\n"
       "add v31.4s, v31.4s, v14.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "add v28.4s, v28.4s, v0.4s\n"
-      "add v29.4s, v29.4s, v1.4s\n"
-      "add v30.4s, v30.4s, v2.4s\n"
-      "add v31.4s, v31.4s, v3.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v2.4s\n"
+      "add v18.4s, v18.4s, v1.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v2.4s\n"
+      "add v22.4s, v22.4s, v1.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v2.4s\n"
+      "add v26.4s, v26.4s, v1.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v2.4s\n"
+      "add v30.4s, v30.4s, v1.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
-      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
-      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
-      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
-      "add x16, x16, #0x40\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+      "add x15, x15, #0x40\n"
       "tbz %x[flags], #5, 110f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v2.16b, v16.16b, v0.16b\n"
+      "and v1.16b, v17.16b, v0.16b\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v2.4s\n"
+      "sqadd v17.4s, v17.4s, v1.4s\n"
+      "and v7.16b, v18.16b, v0.16b\n"
+      "and v6.16b, v19.16b, v0.16b\n"
+      "and v5.16b, v20.16b, v0.16b\n"
+      "and v4.16b, v21.16b, v0.16b\n"
+      "and v3.16b, v22.16b, v0.16b\n"
+      "and v2.16b, v23.16b, v0.16b\n"
+      "and v1.16b, v24.16b, v0.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "and v9.16b, v28.16b, v0.16b\n"
-      "and v10.16b, v29.16b, v0.16b\n"
-      "and v4.16b, v30.16b, v0.16b\n"
-      "and v5.16b, v31.16b, v0.16b\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
-      "sqadd v28.4s, v28.4s, v9.4s\n"
-      "sqadd v29.4s, v29.4s, v10.4s\n"
-      "sqadd v30.4s, v30.4s, v4.4s\n"
-      "sqadd v31.4s, v31.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v7.4s\n"
+      "sqadd v19.4s, v19.4s, v6.4s\n"
+      "sqadd v20.4s, v20.4s, v5.4s\n"
+      "sqadd v21.4s, v21.4s, v4.4s\n"
+      "sqadd v22.4s, v22.4s, v3.4s\n"
+      "sqadd v23.4s, v23.4s, v2.4s\n"
+      "sqadd v24.4s, v24.4s, v1.4s\n"
+      "and v7.16b, v25.16b, v0.16b\n"
+      "and v6.16b, v26.16b, v0.16b\n"
+      "and v5.16b, v27.16b, v0.16b\n"
+      "and v4.16b, v28.16b, v0.16b\n"
+      "and v3.16b, v29.16b, v0.16b\n"
+      "and v2.16b, v30.16b, v0.16b\n"
+      "and v1.16b, v31.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v7.4s\n"
+      "sqadd v26.4s, v26.4s, v6.4s\n"
+      "sqadd v27.4s, v27.4s, v5.4s\n"
+      "sqadd v28.4s, v28.4s, v4.4s\n"
+      "sqadd v29.4s, v29.4s, v3.4s\n"
+      "sqadd v30.4s, v30.4s, v2.4s\n"
+      "sqadd v31.4s, v31.4s, v1.4s\n"
       "110:"  // Height 4: no shift correction
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
@@ -1910,172 +1909,172 @@
       "srshl v29.4s, v29.4s, v0.4s\n"
       "srshl v30.4s, v30.4s, v0.4s\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v0.4s\n"
+      "add v18.4s, v18.4s, v0.4s\n"
+      "add v19.4s, v19.4s, v0.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v0.4s\n"
+      "add v22.4s, v22.4s, v0.4s\n"
+      "add v23.4s, v23.4s, v0.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v25.4s, v25.4s, v0.4s\n"
+      "add v26.4s, v26.4s, v0.4s\n"
+      "add v27.4s, v27.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v0.4s\n"
+      "add v29.4s, v29.4s, v0.4s\n"
+      "add v30.4s, v30.4s, v0.4s\n"
+      "add v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "smin v16.4s, v16.4s, v0.4s\n"
+      "smin v17.4s, v17.4s, v0.4s\n"
+      "smin v18.4s, v18.4s, v0.4s\n"
+      "smin v19.4s, v19.4s, v0.4s\n"
+      "smin v20.4s, v20.4s, v0.4s\n"
+      "smin v21.4s, v21.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v0.4s\n"
+      "smin v23.4s, v23.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v0.4s\n"
+      "smin v25.4s, v25.4s, v0.4s\n"
+      "smin v26.4s, v26.4s, v0.4s\n"
+      "smin v27.4s, v27.4s, v0.4s\n"
+      "smin v28.4s, v28.4s, v0.4s\n"
+      "smin v29.4s, v29.4s, v0.4s\n"
+      "smin v30.4s, v30.4s, v0.4s\n"
+      "smin v31.4s, v31.4s, v0.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "smax v16.4s, v16.4s, v0.4s\n"
+      "smax v17.4s, v17.4s, v0.4s\n"
+      "smax v18.4s, v18.4s, v0.4s\n"
+      "smax v19.4s, v19.4s, v0.4s\n"
+      "smax v20.4s, v20.4s, v0.4s\n"
+      "smax v21.4s, v21.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v0.4s\n"
+      "smax v23.4s, v23.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v0.4s\n"
+      "smax v25.4s, v25.4s, v0.4s\n"
+      "smax v26.4s, v26.4s, v0.4s\n"
+      "smax v27.4s, v27.4s, v0.4s\n"
+      "smax v28.4s, v28.4s, v0.4s\n"
+      "smax v29.4s, v29.4s, v0.4s\n"
+      "smax v30.4s, v30.4s, v0.4s\n"
+      "smax v31.4s, v31.4s, v0.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v0.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v19.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v18.8h, v26.8h, v27.8h\n"
       "uzp1 v28.8h, v28.8h, v29.8h\n"
-      "uzp1 v29.8h, v30.8h, v31.8h\n"
-      "cmp x15, #0x10\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
-      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "uzp1 v17.8h, v30.8h, v31.8h\n"
+      "cmp x14, #0x10\n"
+      "uzp1 v16.16b, v16.16b, v0.16b\n"
+      "uzp1 v20.16b, v20.16b, v19.16b\n"
+      "uzp1 v24.16b, v24.16b, v18.16b\n"
+      "uzp1 v28.16b, v28.16b, v17.16b\n"
       "bge 119f\n"
-      "tbz x15, #3, 114f\n"
-      "str d16, [x14], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "str d28, [x20], #0x8\n"
-      "tbz x15, #2, 112f\n"
-      "st1 { v16.s }[2], [x14], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "st1 { v28.s }[2], [x20], #0x4\n"
-      "tbz x15, #1, 111f\n"
-      "st1 { v16.h }[6], [x14], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "st1 { v28.h }[6], [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[14], [x14]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
-      "st1 { v28.b }[14], [x20]\n"
+      "tbz x14, #3, 114f\n"
+      "str d16, [x13], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x14, #2, 112f\n"
+      "st1 { v16.s }[2], [x13], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
+      "tbz x14, #1, 111f\n"
+      "st1 { v16.h }[6], [x13], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[14], [x13]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
+      "st1 { v28.b }[14], [x21]\n"
       "b 118f\n"
       "111:"  // Height 4: Partial direct writeback: partial_1_12
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[12], [x14]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
-      "st1 { v28.b }[12], [x20]\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[12], [x13]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
+      "st1 { v28.b }[12], [x21]\n"
       "b 118f\n"
       "112:"  // Height 4: Partial direct writeback: partial_2_8
-      "tbz x15, #1, 113f\n"
-      "st1 { v16.h }[4], [x14], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "st1 { v28.h }[4], [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[10], [x14]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
-      "st1 { v28.b }[10], [x20]\n"
+      "tbz x14, #1, 113f\n"
+      "st1 { v16.h }[4], [x13], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[10], [x13]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
+      "st1 { v28.b }[10], [x21]\n"
       "b 118f\n"
       "113:"  // Height 4: Partial direct writeback: partial_1_8
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[8], [x14]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
-      "st1 { v28.b }[8], [x20]\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[8], [x13]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
+      "st1 { v28.b }[8], [x21]\n"
       "b 118f\n"
       "114:"  // Height 4: Partial direct writeback: partial_4_0
-      "tbz x15, #2, 116f\n"
-      "str s16, [x14], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "str s28, [x20], #0x4\n"
-      "tbz x15, #1, 115f\n"
-      "st1 { v16.h }[2], [x14], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "st1 { v28.h }[2], [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[6], [x14]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
-      "st1 { v28.b }[6], [x20]\n"
+      "tbz x14, #2, 116f\n"
+      "str s16, [x13], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "str s28, [x21], #0x4\n"
+      "tbz x14, #1, 115f\n"
+      "st1 { v16.h }[2], [x13], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[6], [x13]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
+      "st1 { v28.b }[6], [x21]\n"
       "b 118f\n"
       "115:"  // Height 4: Partial direct writeback: partial_1_4
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[4], [x14]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
-      "st1 { v28.b }[4], [x20]\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[4], [x13]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
+      "st1 { v28.b }[4], [x21]\n"
       "b 118f\n"
       "116:"  // Height 4: Partial direct writeback: partial_2_0
-      "tbz x15, #1, 117f\n"
-      "str h16, [x14], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "str h28, [x20], #0x2\n"
-      "tbz x15, #0, 118f\n"
-      "st1 { v16.b }[2], [x14]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
-      "st1 { v28.b }[2], [x20]\n"
+      "tbz x14, #1, 117f\n"
+      "str h16, [x13], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "str h28, [x21], #0x2\n"
+      "tbz x14, #0, 118f\n"
+      "st1 { v16.b }[2], [x13]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
+      "st1 { v28.b }[2], [x21]\n"
       "b 118f\n"
       "117:"  // Height 4: Partial direct writeback: partial_1_0
-      "str b16, [x14, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
-      "str b28, [x20, #0x0]\n"
+      "str b16, [x13, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
       "118:"  // Height 4: Partial direct writeback: Done
       "b 120f\n"
       "119:"  // Height 4: Full writeback
-      "str q16, [x14, #0x0]\n"
-      "add x14, x14, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
-      "str q28, [x20, #0x0]\n"
+      "str q16, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
       "120:"  // Height 4: Writeback done
-      "subs x15, x15, #0x10\n"
+      "subs x14, x14, #0x10\n"
       "bgt 92b\n"
       "subs %x[M], %x[M], #0x4\n"
       "beq 122f\n"
@@ -2089,10 +2088,9 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "122:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
     );
 }
 
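Note (editorial aside, not part of the patch): the rewritten tail above is the requantization sequence of this quantized kernel — sqrdmulh by per_layer_mul, a rounding right shift by per_layer_right_shift via srshl, the optional sqadd sign fixup when flag bit 5 is set, the c_offset add, the smin/smax clamp against maxval/minval, and the uzp1 narrowing — with only the scratch registers renamed. The scalar sketch below is for orientation only, under the assumption that the Requantize32 fields behave as described; the helper names (sqrdmulh_lane, rounding_shift_right, requantize_lane) are illustrative and not library API, the per-channel path and the optional sqadd fixup are omitted for brevity, and the generated assembly in the diff remains the actual implementation.

// requantize_sketch.cpp — illustrative scalar model of the assembly tail above.
#include <algorithm>
#include <cstdint>

// One-lane model of AArch64 SQRDMULH: high half of the rounded, doubled
// product, with saturation in the single overflowing case.
static int32_t sqrdmulh_lane(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;              // saturate
    return (int32_t)(((int64_t)a * (int64_t)b + (1LL << 30)) >> 31);     // arithmetic shift
}

// One-lane model of SRSHL with a negative shift operand, i.e. a rounding
// arithmetic shift right by 'shift' bits (the kernel stores the shift as a
// negative srshl amount; here we pass its magnitude).
static int32_t rounding_shift_right(int32_t v, int32_t shift)
{
    if (shift <= 0) return v;
    return (int32_t)(((int64_t)v + (1LL << (shift - 1))) >> shift);
}

// One output lane: scale, shift, add the output offset, clamp, narrow —
// mirroring sqrdmulh / srshl / add c_offset / smin maxval / smax minval / uzp1.
static uint8_t requantize_lane(int32_t acc, int32_t per_layer_mul, int32_t right_shift,
                               int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh_lane(acc, per_layer_mul);
    v = rounding_shift_right(v, right_shift);
    v += c_offset;
    v = std::min(std::max(v, minval), maxval);
    return (uint8_t)v;
}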
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
index 31fbf88..ebe583b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -78,7 +78,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 91f\n"
@@ -102,11 +101,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -128,32 +127,32 @@
       "blt 9f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q21, [x28, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q20, [x28, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q26, [x28, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q25, [x28, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q24, [x28, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q23, [x28, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q22, [x28, #0xd0]\n"
+      ".inst 0x6fa0e2b3  // udot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr q21, [x28, #0xe0]\n"
+      ".inst 0x6f80ea90  // udot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr q20, [x28, #0xf0]\n"
+      ".inst 0x6f80eb51  // udot v17.4s, v26.16b, v0.4b[2]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x6f80eb32  // udot v18.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f80eb13  // udot v19.4s, v24.16b, v0.4b[2]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eaf0  // udot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ead1  // udot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eab2  // udot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea93  // udot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 8f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "8:"  // Height 1: Multiply loop: unique 1: skip row sum
@@ -171,33 +170,33 @@
       "bge 7b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q21, [x28, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q20, [x28, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q26, [x28, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q25, [x28, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q24, [x28, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q23, [x28, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q22, [x28, #0xd0]\n"
+      ".inst 0x6fa0e2b3  // udot v19.4s, v21.16b, v0.4b[1]\n"
+      "ldr q21, [x28, #0xe0]\n"
+      ".inst 0x6f80ea90  // udot v16.4s, v20.16b, v0.4b[2]\n"
+      "ldr q20, [x28, #0xf0]\n"
+      ".inst 0x6f80eb51  // udot v17.4s, v26.16b, v0.4b[2]\n"
       "sub x25, x25, #0x10\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x6f80eb32  // udot v18.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f80eb13  // udot v19.4s, v24.16b, v0.4b[2]\n"
       "add x24, x24, #0x10\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eaf0  // udot v16.4s, v23.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ead1  // udot v17.4s, v22.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eab2  // udot v18.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea93  // udot v19.4s, v20.16b, v0.4b[3]\n"
       "tbnz %x[flags], #31, 10f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "10:"  // Height 1: Multiply loop: unique 2: skip row sum
@@ -211,16 +210,16 @@
       "tbnz %x[flags], #31, 13f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "13:"  // Height 1: Multiply loop: unique 3: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q23, [x28, #0x0]\n"
+      "ldr q22, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q21, [x28, #0x20]\n"
+      "ldr q20, [x28, #0x30]\n"
+      ".inst 0x6f80e2f0  // udot v16.4s, v23.16b, v0.4b[0]\n"
+      ".inst 0x6f80e2d1  // udot v17.4s, v22.16b, v0.4b[0]\n"
+      ".inst 0x6f80e2b2  // udot v18.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f80e293  // udot v19.4s, v20.16b, v0.4b[0]\n"
       "add x28, x28, #0x40\n"
       "bge 12b\n"
       "14:"  // Height 1: Multiply loop: Skip odd blocks
@@ -236,14 +235,14 @@
       "tbnz %x[flags], #31, 17f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "17:"  // Height 1: Multiply loop: unique 4: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
+      "ldr q21, [x28, #0x0]\n"
+      "ldr q20, [x28, #0x10]\n"
+      ".inst 0x6f80e2b0  // udot v16.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f80e291  // udot v17.4s, v20.16b, v0.4b[0]\n"
+      "ldr q21, [x28, #0x20]\n"
+      "ldr q20, [x28, #0x30]\n"
+      ".inst 0x6f80e2b2  // udot v18.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f80e293  // udot v19.4s, v20.16b, v0.4b[0]\n"
       "add x28, x28, #0x40\n"
       "18:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -252,72 +251,72 @@
       "bne 4b\n"
       "prfm pstl1keep, [x27, #0x0]\n"
       "tbnz %x[flags], #31, 19f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v1.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "neg v1.4s, v1.4s\n"
+      "neg v20.4s, v20.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "mul v11.4s, v11.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v20.4s\n"
       "19:"  // Height 1: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q24, [x10, #0x0]\n"
+      "ldr q23, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q22, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v20.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "add v16.4s, v16.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v23.4s\n"
+      "add v18.4s, v18.4s, v22.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v19.4s, v19.4s, v21.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v20.4s\n"
       "add x10, x10, #0x40\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v20.4s\n"
       "tbz %x[flags], #5, 20f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "and v22.16b, v17.16b, v0.16b\n"
+      "and v21.16b, v18.16b, v0.16b\n"
+      "and v20.16b, v19.16b, v0.16b\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "sqadd v17.4s, v17.4s, v22.4s\n"
+      "sqadd v18.4s, v18.4s, v21.4s\n"
+      "sqadd v19.4s, v19.4s, v20.4s\n"
       "20:"  // Height 1: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v22.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v21.4s }, [x20]\n"
+      "add v16.4s, v16.4s, v22.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v18.4s, v18.4s, v22.4s\n"
+      "add v19.4s, v19.4s, v22.4s\n"
       "cmp x9, #0x10\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "smin v16.4s, v16.4s, v21.4s\n"
+      "smin v17.4s, v17.4s, v21.4s\n"
+      "smin v18.4s, v18.4s, v21.4s\n"
+      "smin v19.4s, v19.4s, v21.4s\n"
+      "smax v16.4s, v16.4s, v20.4s\n"
+      "smax v17.4s, v17.4s, v20.4s\n"
+      "smax v18.4s, v18.4s, v20.4s\n"
+      "smax v19.4s, v19.4s, v20.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
       "uzp1 v17.8h, v18.8h, v19.8h\n"
       "uzp1 v16.16b, v16.16b, v17.16b\n"
@@ -397,12 +396,12 @@
       "34:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 35f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -410,7 +409,7 @@
       "b 36f\n"
       "35:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "36:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "blt 41f\n"
@@ -428,48 +427,48 @@
       "37:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x6fa0e333  // udot v19.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e337  // udot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x6f80eb10  // udot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb14  // udot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr q24, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6f80ebd1  // udot v17.4s, v30.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebd5  // udot v21.4s, v30.16b, v1.4b[2]\n"
+      ".inst 0x6f80ebb2  // udot v18.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebb6  // udot v22.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f80eb93  // udot v19.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb97  // udot v23.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6fa0eb70  // udot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb74  // udot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb51  // udot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb55  // udot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb32  // udot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb36  // udot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb13  // udot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb17  // udot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 38f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
@@ -491,49 +490,49 @@
       "39:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
       ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       "sub x25, x25, #0x10\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x6fa0e333  // udot v19.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e337  // udot v23.4s, v25.16b, v1.4b[1]\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x6f80eb10  // udot v16.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb14  // udot v20.4s, v24.16b, v1.4b[2]\n"
+      "ldr q24, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6f80ebd1  // udot v17.4s, v30.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebd5  // udot v21.4s, v30.16b, v1.4b[2]\n"
+      ".inst 0x6f80ebb2  // udot v18.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebb6  // udot v22.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f80eb93  // udot v19.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb97  // udot v23.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6fa0eb70  // udot v16.4s, v27.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb74  // udot v20.4s, v27.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb51  // udot v17.4s, v26.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb55  // udot v21.4s, v26.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb32  // udot v18.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb36  // udot v22.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa0eb13  // udot v19.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb17  // udot v23.4s, v24.16b, v1.4b[3]\n"
       "tbnz %x[flags], #31, 40f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
@@ -551,21 +550,21 @@
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       "43:"  // Height 2: Multiply loop: unique 7: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q27, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x6f80e370  // udot v16.4s, v27.16b, v0.4b[0]\n"
+      ".inst 0x6f81e374  // udot v20.4s, v27.16b, v1.4b[0]\n"
+      ".inst 0x6f80e351  // udot v17.4s, v26.16b, v0.4b[0]\n"
+      ".inst 0x6f81e355  // udot v21.4s, v26.16b, v1.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
+      ".inst 0x6f80e332  // udot v18.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e336  // udot v22.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f80e313  // udot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e317  // udot v23.4s, v24.16b, v1.4b[0]\n"
       "bge 42b\n"
       "44:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x25, 48f\n"
@@ -584,209 +583,209 @@
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       "47:"  // Height 2: Multiply loop: unique 8: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x6f81e154  // udot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x6f81e095  // udot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      ".inst 0x6f80e310  // udot v16.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e314  // udot v20.4s, v24.16b, v1.4b[0]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x6f80e351  // udot v17.4s, v26.16b, v0.4b[0]\n"
+      ".inst 0x6f81e355  // udot v21.4s, v26.16b, v1.4b[0]\n"
+      ".inst 0x6f80e332  // udot v18.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e336  // udot v22.4s, v25.16b, v1.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d7  // udot v23.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e313  // udot v19.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e317  // udot v23.4s, v24.16b, v1.4b[0]\n"
       "48:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 34b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x27, x20\n"
+      "add x23, x27, x20\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "tbnz %x[flags], #31, 49f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v2.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "neg v2.4s, v2.4s\n"
+      "neg v24.4s, v24.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
-      "mul v11.4s, v11.4s, v2.4s\n"
-      "mul v12.4s, v12.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v24.4s\n"
+      "mul v12.4s, v12.4s, v24.4s\n"
       "49:"  // Height 2: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q27, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q26, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "add v17.4s, v17.4s, v27.4s\n"
       "add x10, x10, #0x40\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "add v20.4s, v20.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v25.4s\n"
+      "add v20.4s, v20.4s, v28.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v21.4s, v21.4s, v27.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v23.4s, v23.4s, v25.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v24.4s\n"
       "tbz %x[flags], #5, 50f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v24.16b, v16.16b, v0.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v24.4s\n"
+      "and v30.16b, v17.16b, v0.16b\n"
+      "and v29.16b, v18.16b, v0.16b\n"
+      "and v28.16b, v19.16b, v0.16b\n"
+      "and v27.16b, v20.16b, v0.16b\n"
+      "and v26.16b, v21.16b, v0.16b\n"
+      "and v25.16b, v22.16b, v0.16b\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v30.4s\n"
+      "sqadd v18.4s, v18.4s, v29.4s\n"
+      "sqadd v19.4s, v19.4s, v28.4s\n"
+      "sqadd v20.4s, v20.4s, v27.4s\n"
+      "sqadd v21.4s, v21.4s, v26.4s\n"
+      "sqadd v22.4s, v22.4s, v25.4s\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
       "50:"  // Height 2: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "cmp x9, #0x10\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v26.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v26.4s\n"
+      "add v20.4s, v20.4s, v26.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v23.4s, v23.4s, v26.4s\n"
+      "smin v16.4s, v16.4s, v25.4s\n"
+      "smin v17.4s, v17.4s, v25.4s\n"
+      "smin v18.4s, v18.4s, v25.4s\n"
+      "smin v19.4s, v19.4s, v25.4s\n"
+      "smin v20.4s, v20.4s, v25.4s\n"
+      "smin v21.4s, v21.4s, v25.4s\n"
+      "smin v22.4s, v22.4s, v25.4s\n"
+      "smin v23.4s, v23.4s, v25.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v17.8h, v22.8h, v23.8h\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v20.16b, v20.16b, v17.16b\n"
       "bge 59f\n"
       "tbz x9, #3, 54f\n"
       "str d16, [x27], #0x8\n"
-      "str d20, [x22], #0x8\n"
+      "str d20, [x23], #0x8\n"
       "tbz x9, #2, 52f\n"
       "st1 { v16.s }[2], [x27], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
       "tbz x9, #1, 51f\n"
       "st1 { v16.h }[6], [x27], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[14], [x27]\n"
-      "st1 { v20.b }[14], [x22]\n"
+      "st1 { v20.b }[14], [x23]\n"
       "b 58f\n"
       "51:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[12], [x27]\n"
-      "st1 { v20.b }[12], [x22]\n"
+      "st1 { v20.b }[12], [x23]\n"
       "b 58f\n"
       "52:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x9, #1, 53f\n"
       "st1 { v16.h }[4], [x27], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[10], [x27]\n"
-      "st1 { v20.b }[10], [x22]\n"
+      "st1 { v20.b }[10], [x23]\n"
       "b 58f\n"
       "53:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[8], [x27]\n"
-      "st1 { v20.b }[8], [x22]\n"
+      "st1 { v20.b }[8], [x23]\n"
       "b 58f\n"
       "54:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x9, #2, 56f\n"
       "str s16, [x27], #0x4\n"
-      "str s20, [x22], #0x4\n"
+      "str s20, [x23], #0x4\n"
       "tbz x9, #1, 55f\n"
       "st1 { v16.h }[2], [x27], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[6], [x27]\n"
-      "st1 { v20.b }[6], [x22]\n"
+      "st1 { v20.b }[6], [x23]\n"
       "b 58f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[4], [x27]\n"
-      "st1 { v20.b }[4], [x22]\n"
+      "st1 { v20.b }[4], [x23]\n"
       "b 58f\n"
       "56:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x9, #1, 57f\n"
       "str h16, [x27], #0x2\n"
-      "str h20, [x22], #0x2\n"
+      "str h20, [x23], #0x2\n"
       "tbz x9, #0, 58f\n"
       "st1 { v16.b }[2], [x27]\n"
-      "st1 { v20.b }[2], [x22]\n"
+      "st1 { v20.b }[2], [x23]\n"
       "b 58f\n"
       "57:"  // Height 2: Partial direct writeback: partial_1_0
       "str b16, [x27, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
       "58:"  // Height 2: Partial direct writeback: Done
       "b 60f\n"
       "59:"  // Height 2: Full writeback
       "str q16, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q20, [x22, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
       "60:"  // Height 2: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 32b\n"
@@ -819,13 +818,13 @@
       "64:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 65f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 66f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -834,8 +833,8 @@
       "b 66f\n"
       "65:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "66:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "blt 71f\n"
@@ -857,62 +856,62 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q29, [x28, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q28, [x28, #0x80]\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q5, [x28, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q4, [x28, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q3, [x28, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q31, [x28, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q30, [x28, #0xd0]\n"
+      ".inst 0x6fa0e3b3  // udot v19.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3b7  // udot v23.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3bb  // udot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr q29, [x28, #0xe0]\n"
+      ".inst 0x6f80eb90  // udot v16.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb94  // udot v20.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb98  // udot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr q28, [x28, #0xf0]\n"
+      ".inst 0x6f80e8b1  // udot v17.4s, v5.16b, v0.4b[2]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8fa  // udot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6f82e91b  // udot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e938  // udot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e959  // udot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e89a  // udot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8bb  // udot v27.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x6f81e8b5  // udot v21.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8b9  // udot v25.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x6f80e892  // udot v18.4s, v4.16b, v0.4b[2]\n"
+      ".inst 0x6f81e896  // udot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x6f82e89a  // udot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x6f80e873  // udot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x6f81e877  // udot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x6f82e87b  // udot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x6fa0ebf0  // udot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebf4  // udot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebf8  // udot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebd1  // udot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebd5  // udot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebd9  // udot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebb2  // udot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebb6  // udot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebba  // udot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eb93  // udot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb97  // udot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb9b  // udot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 68f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
@@ -940,63 +939,63 @@
       "sub x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
-      "ldr q4, [x28, #0x70]\n"
+      "ldr q29, [x28, #0x70]\n"
       ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
-      "ldr q5, [x28, #0x80]\n"
+      "ldr q28, [x28, #0x80]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
       ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x28, #0x90]\n"
+      "ldr q5, [x28, #0x90]\n"
       ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x28, #0xa0]\n"
+      "ldr q4, [x28, #0xa0]\n"
       ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
       ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
       ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
-      "ldr q8, [x28, #0xb0]\n"
+      "ldr q3, [x28, #0xb0]\n"
       ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
       ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
       ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
-      "ldr q9, [x28, #0xc0]\n"
+      "ldr q31, [x28, #0xc0]\n"
       ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
       ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
       ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
-      "ldr q10, [x28, #0xd0]\n"
-      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
-      "ldr q4, [x28, #0xe0]\n"
-      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
-      "ldr q5, [x28, #0xf0]\n"
-      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q30, [x28, #0xd0]\n"
+      ".inst 0x6fa0e3b3  // udot v19.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3b7  // udot v23.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3bb  // udot v27.4s, v29.16b, v2.4b[1]\n"
+      "ldr q29, [x28, #0xe0]\n"
+      ".inst 0x6f80eb90  // udot v16.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb94  // udot v20.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb98  // udot v24.4s, v28.16b, v2.4b[2]\n"
+      "ldr q28, [x28, #0xf0]\n"
+      ".inst 0x6f80e8b1  // udot v17.4s, v5.16b, v0.4b[2]\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8fa  // udot v26.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
-      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
-      ".inst 0x6f82e91b  // udot v27.4s, v8.16b, v2.4b[2]\n"
-      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e938  // udot v24.4s, v9.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e959  // udot v25.4s, v10.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e89a  // udot v26.4s, v4.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8bb  // udot v27.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x6f81e8b5  // udot v21.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8b9  // udot v25.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x6f80e892  // udot v18.4s, v4.16b, v0.4b[2]\n"
+      ".inst 0x6f81e896  // udot v22.4s, v4.16b, v1.4b[2]\n"
+      ".inst 0x6f82e89a  // udot v26.4s, v4.16b, v2.4b[2]\n"
+      ".inst 0x6f80e873  // udot v19.4s, v3.16b, v0.4b[2]\n"
+      ".inst 0x6f81e877  // udot v23.4s, v3.16b, v1.4b[2]\n"
+      ".inst 0x6f82e87b  // udot v27.4s, v3.16b, v2.4b[2]\n"
+      ".inst 0x6fa0ebf0  // udot v16.4s, v31.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebf4  // udot v20.4s, v31.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebf8  // udot v24.4s, v31.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebd1  // udot v17.4s, v30.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebd5  // udot v21.4s, v30.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebd9  // udot v25.4s, v30.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ebb2  // udot v18.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebb6  // udot v22.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebba  // udot v26.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eb93  // udot v19.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb97  // udot v23.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb9b  // udot v27.4s, v28.16b, v2.4b[3]\n"
       "tbnz %x[flags], #31, 70f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
@@ -1018,25 +1017,25 @@
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "73:"  // Height 3: Multiply loop: unique 11: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q31, [x28, #0x0]\n"
+      "ldr q30, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d8  // udot v24.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q29, [x28, #0x20]\n"
+      "ldr q28, [x28, #0x30]\n"
+      ".inst 0x6f80e3f0  // udot v16.4s, v31.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3f4  // udot v20.4s, v31.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3f8  // udot v24.4s, v31.16b, v2.4b[0]\n"
+      ".inst 0x6f80e3d1  // udot v17.4s, v30.16b, v0.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f9  // udot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x6f82e11a  // udot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x6f82e13b  // udot v27.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x6f81e3d5  // udot v21.4s, v30.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3d9  // udot v25.4s, v30.16b, v2.4b[0]\n"
+      ".inst 0x6f80e3b2  // udot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3b6  // udot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3ba  // udot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f80e393  // udot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e397  // udot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e39b  // udot v27.4s, v28.16b, v2.4b[0]\n"
       "bge 72b\n"
       "74:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x25, 78f\n"
@@ -1059,144 +1058,144 @@
       ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "77:"  // Height 3: Multiply loop: unique 12: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x6f81e154  // udot v20.4s, v10.16b, v1.4b[0]\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6f82e158  // udot v24.4s, v10.16b, v2.4b[0]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x6f81e095  // udot v21.4s, v4.16b, v1.4b[0]\n"
-      ".inst 0x6f82e099  // udot v25.4s, v4.16b, v2.4b[0]\n"
+      "ldr q31, [x28, #0x0]\n"
+      "ldr q30, [x28, #0x10]\n"
+      ".inst 0x6f80e3f0  // udot v16.4s, v31.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3f4  // udot v20.4s, v31.16b, v1.4b[0]\n"
+      "ldr q29, [x28, #0x20]\n"
+      "ldr q28, [x28, #0x30]\n"
+      ".inst 0x6f82e3f8  // udot v24.4s, v31.16b, v2.4b[0]\n"
+      ".inst 0x6f80e3d1  // udot v17.4s, v30.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3d5  // udot v21.4s, v30.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3d9  // udot v25.4s, v30.16b, v2.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0ba  // udot v26.4s, v5.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d7  // udot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0db  // udot v27.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f80e3b2  // udot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3b6  // udot v22.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3ba  // udot v26.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f80e393  // udot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e397  // udot v23.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e39b  // udot v27.4s, v28.16b, v2.4b[0]\n"
       "78:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 64b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "tbnz %x[flags], #31, 79f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v3.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "neg v3.4s, v3.4s\n"
+      "neg v28.4s, v28.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "mul v11.4s, v11.4s, v3.4s\n"
-      "mul v12.4s, v12.4s, v3.4s\n"
-      "mul v13.4s, v13.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v28.4s\n"
+      "mul v12.4s, v12.4s, v28.4s\n"
+      "mul v13.4s, v13.4s, v28.4s\n"
       "79:"  // Height 3: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q31, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q30, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v24.4s, v24.4s, v13.4s\n"
       "add v25.4s, v25.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
       "add v26.4s, v26.4s, v13.4s\n"
       "add v27.4s, v27.4s, v13.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v31.4s\n"
+      "add v18.4s, v18.4s, v30.4s\n"
+      "add v19.4s, v19.4s, v29.4s\n"
       "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v31.4s\n"
+      "add v22.4s, v22.4s, v30.4s\n"
+      "add v23.4s, v23.4s, v29.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v31.4s\n"
+      "add v26.4s, v26.4s, v30.4s\n"
+      "add v27.4s, v27.4s, v29.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v28.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v28.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v28.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v28.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v28.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v28.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v28.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v28.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v28.4s\n"
       "tbz %x[flags], #5, 80f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "and v1.16b, v16.16b, v0.16b\n"
+      "and v31.16b, v17.16b, v0.16b\n"
+      "and v30.16b, v18.16b, v0.16b\n"
+      "and v29.16b, v19.16b, v0.16b\n"
+      "and v28.16b, v20.16b, v0.16b\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v1.4s\n"
+      "sqadd v17.4s, v17.4s, v31.4s\n"
+      "sqadd v18.4s, v18.4s, v30.4s\n"
+      "sqadd v19.4s, v19.4s, v29.4s\n"
+      "sqadd v20.4s, v20.4s, v28.4s\n"
+      "and v3.16b, v21.16b, v0.16b\n"
+      "and v2.16b, v22.16b, v0.16b\n"
+      "and v1.16b, v23.16b, v0.16b\n"
+      "and v31.16b, v24.16b, v0.16b\n"
+      "and v30.16b, v25.16b, v0.16b\n"
+      "and v29.16b, v26.16b, v0.16b\n"
+      "and v28.16b, v27.16b, v0.16b\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v31.4s, v31.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v3.4s\n"
+      "sqadd v22.4s, v22.4s, v2.4s\n"
+      "sqadd v23.4s, v23.4s, v1.4s\n"
+      "sqadd v24.4s, v24.4s, v31.4s\n"
+      "sqadd v25.4s, v25.4s, v30.4s\n"
+      "sqadd v26.4s, v26.4s, v29.4s\n"
+      "sqadd v27.4s, v27.4s, v28.4s\n"
       "80:"  // Height 3: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v30.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1204,132 +1203,132 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v30.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v30.4s\n"
+      "add v19.4s, v19.4s, v30.4s\n"
+      "add v20.4s, v20.4s, v30.4s\n"
+      "add v21.4s, v21.4s, v30.4s\n"
+      "add v22.4s, v22.4s, v30.4s\n"
+      "add v23.4s, v23.4s, v30.4s\n"
+      "add v24.4s, v24.4s, v30.4s\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v30.4s\n"
+      "add v27.4s, v27.4s, v30.4s\n"
+      "smin v16.4s, v16.4s, v29.4s\n"
+      "smin v17.4s, v17.4s, v29.4s\n"
+      "smin v18.4s, v18.4s, v29.4s\n"
+      "smin v19.4s, v19.4s, v29.4s\n"
+      "smin v20.4s, v20.4s, v29.4s\n"
+      "smin v21.4s, v21.4s, v29.4s\n"
+      "smin v22.4s, v22.4s, v29.4s\n"
+      "smin v23.4s, v23.4s, v29.4s\n"
+      "smin v24.4s, v24.4s, v29.4s\n"
+      "smin v25.4s, v25.4s, v29.4s\n"
+      "smin v26.4s, v26.4s, v29.4s\n"
+      "smin v27.4s, v27.4s, v29.4s\n"
+      "smax v16.4s, v16.4s, v28.4s\n"
+      "smax v17.4s, v17.4s, v28.4s\n"
+      "smax v18.4s, v18.4s, v28.4s\n"
+      "smax v19.4s, v19.4s, v28.4s\n"
+      "smax v20.4s, v20.4s, v28.4s\n"
+      "smax v21.4s, v21.4s, v28.4s\n"
+      "smax v22.4s, v22.4s, v28.4s\n"
+      "smax v23.4s, v23.4s, v28.4s\n"
+      "smax v24.4s, v24.4s, v28.4s\n"
+      "smax v25.4s, v25.4s, v28.4s\n"
+      "smax v26.4s, v26.4s, v28.4s\n"
+      "smax v27.4s, v27.4s, v28.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v18.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v20.16b, v20.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 89f\n"
       "tbz x9, #3, 84f\n"
       "str d16, [x27], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x9, #2, 82f\n"
       "st1 { v16.s }[2], [x27], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x9, #1, 81f\n"
       "st1 { v16.h }[6], [x27], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[14], [x27]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 88f\n"
       "81:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[12], [x27]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 88f\n"
       "82:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x9, #1, 83f\n"
       "st1 { v16.h }[4], [x27], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[10], [x27]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 88f\n"
       "83:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[8], [x27]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 88f\n"
       "84:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x9, #2, 86f\n"
       "str s16, [x27], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x9, #1, 85f\n"
       "st1 { v16.h }[2], [x27], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[6], [x27]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 88f\n"
       "85:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[4], [x27]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 88f\n"
       "86:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x9, #1, 87f\n"
       "str h16, [x27], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x9, #0, 88f\n"
       "st1 { v16.b }[2], [x27]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 88f\n"
       "87:"  // Height 3: Partial direct writeback: partial_1_0
       "str b16, [x27, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "88:"  // Height 3: Partial direct writeback: Done
       "b 90f\n"
       "89:"  // Height 3: Full writeback
       "str q16, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "90:"  // Height 3: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 62b\n"
@@ -1370,14 +1369,14 @@
       "94:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 95f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 96f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1387,9 +1386,9 @@
       "b 96f\n"
       "95:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "96:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "blt 101f\n"
@@ -1614,29 +1613,29 @@
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
       "103:"  // Height 4: Multiply loop: unique 15: skip row sum
-      "ldr q6, [x28, #0x0]\n"
-      "ldr q7, [x28, #0x10]\n"
+      "ldr q7, [x28, #0x0]\n"
+      "ldr q6, [x28, #0x10]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ldr q8, [x28, #0x20]\n"
-      "ldr q9, [x28, #0x30]\n"
-      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d8  // udot v24.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0dc  // udot v28.4s, v6.16b, v3.4b[0]\n"
+      "ldr q5, [x28, #0x20]\n"
+      "ldr q4, [x28, #0x30]\n"
+      ".inst 0x6f80e0f0  // udot v16.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0f4  // udot v20.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f8  // udot v24.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0fc  // udot v28.4s, v7.16b, v3.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f9  // udot v25.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0fd  // udot v29.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
-      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
-      ".inst 0x6f82e11a  // udot v26.4s, v8.16b, v2.4b[0]\n"
-      ".inst 0x6f83e11e  // udot v30.4s, v8.16b, v3.4b[0]\n"
-      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
-      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
-      ".inst 0x6f82e13b  // udot v27.4s, v9.16b, v2.4b[0]\n"
-      ".inst 0x6f83e13f  // udot v31.4s, v9.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d9  // udot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0dd  // udot v29.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0ba  // udot v26.4s, v5.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0be  // udot v30.4s, v5.16b, v3.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x6f82e09b  // udot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x6f83e09f  // udot v31.4s, v4.16b, v3.4b[0]\n"
       "bge 102b\n"
       "104:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x25, 108f\n"
@@ -1663,73 +1662,73 @@
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
       "107:"  // Height 4: Multiply loop: unique 16: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6f80e150  // udot v16.4s, v10.16b, v0.4b[0]\n"
-      ".inst 0x6f81e154  // udot v20.4s, v10.16b, v1.4b[0]\n"
+      "ldr q7, [x28, #0x0]\n"
+      "ldr q6, [x28, #0x10]\n"
+      ".inst 0x6f80e0f0  // udot v16.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0f4  // udot v20.4s, v7.16b, v1.4b[0]\n"
       "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6f82e158  // udot v24.4s, v10.16b, v2.4b[0]\n"
-      ".inst 0x6f83e15c  // udot v28.4s, v10.16b, v3.4b[0]\n"
-      ".inst 0x6f80e091  // udot v17.4s, v4.16b, v0.4b[0]\n"
-      ".inst 0x6f81e095  // udot v21.4s, v4.16b, v1.4b[0]\n"
+      "ldr q4, [x28, #0x30]\n"
+      ".inst 0x6f82e0f8  // udot v24.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0fc  // udot v28.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
       "add x28, x28, #0x40\n"
-      ".inst 0x6f82e099  // udot v25.4s, v4.16b, v2.4b[0]\n"
-      ".inst 0x6f83e09d  // udot v29.4s, v4.16b, v3.4b[0]\n"
+      ".inst 0x6f82e0d9  // udot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0dd  // udot v29.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x6f80e0b2  // udot v18.4s, v5.16b, v0.4b[0]\n"
       ".inst 0x6f81e0b6  // udot v22.4s, v5.16b, v1.4b[0]\n"
       ".inst 0x6f82e0ba  // udot v26.4s, v5.16b, v2.4b[0]\n"
       ".inst 0x6f83e0be  // udot v30.4s, v5.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0d3  // udot v19.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0d7  // udot v23.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0db  // udot v27.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0df  // udot v31.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x6f82e09b  // udot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x6f83e09f  // udot v31.4s, v4.16b, v3.4b[0]\n"
       "108:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 94b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "add x20, x21, x20\n"
+      "add x21, x22, x20\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
       "prfm pstl1keep, [x21, #0x0]\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
       "tbnz %x[flags], #31, 109f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "neg v4.4s, v4.4s\n"
+      "neg v0.4s, v0.4s\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v12.4s, v12.4s, v12.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
       "addp v14.4s, v14.4s, v14.4s\n"
-      "mul v11.4s, v11.4s, v4.4s\n"
-      "mul v12.4s, v12.4s, v4.4s\n"
-      "mul v13.4s, v13.4s, v4.4s\n"
-      "mul v14.4s, v14.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v0.4s\n"
+      "mul v12.4s, v12.4s, v0.4s\n"
+      "mul v13.4s, v13.4s, v0.4s\n"
+      "mul v14.4s, v14.4s, v0.4s\n"
       "109:"  // Height 4: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q4, [x10, #0x10]\n"
       "add v16.4s, v16.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q3, [x10, #0x20]\n"
+      "ldr q2, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
       "add v20.4s, v20.4s, v12.4s\n"
       "add v21.4s, v21.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "add v22.4s, v22.4s, v12.4s\n"
       "add v23.4s, v23.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v24.4s, v24.4s, v13.4s\n"
       "add v25.4s, v25.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
@@ -1740,100 +1739,100 @@
       "add v30.4s, v30.4s, v14.4s\n"
       "add v31.4s, v31.4s, v14.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v2.4s\n"
       "add v20.4s, v20.4s, v0.4s\n"
-      "add v21.4s, v21.4s, v1.4s\n"
-      "add v22.4s, v22.4s, v2.4s\n"
-      "add v23.4s, v23.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "add v22.4s, v22.4s, v3.4s\n"
+      "add v23.4s, v23.4s, v2.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v2.4s\n"
       "add v28.4s, v28.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v29.4s, v29.4s, v1.4s\n"
-      "add v30.4s, v30.4s, v2.4s\n"
-      "add v31.4s, v31.4s, v3.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
-      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
-      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
-      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v29.4s, v29.4s, v4.4s\n"
+      "add v30.4s, v30.4s, v3.4s\n"
+      "add v31.4s, v31.4s, v2.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
       "tbz %x[flags], #5, 110f\n"
-      "and v4.16b, v16.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v16.4s, v16.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "and v8.16b, v20.16b, v0.16b\n"
-      "and v9.16b, v21.16b, v0.16b\n"
-      "and v10.16b, v22.16b, v0.16b\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v2.16b, v16.16b, v0.16b\n"
+      "and v1.16b, v17.16b, v0.16b\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v2.4s\n"
+      "sqadd v17.4s, v17.4s, v1.4s\n"
+      "and v7.16b, v18.16b, v0.16b\n"
+      "and v6.16b, v19.16b, v0.16b\n"
+      "and v5.16b, v20.16b, v0.16b\n"
+      "and v4.16b, v21.16b, v0.16b\n"
+      "and v3.16b, v22.16b, v0.16b\n"
+      "and v2.16b, v23.16b, v0.16b\n"
+      "and v1.16b, v24.16b, v0.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
-      "sqadd v20.4s, v20.4s, v8.4s\n"
-      "sqadd v21.4s, v21.4s, v9.4s\n"
-      "sqadd v22.4s, v22.4s, v10.4s\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "and v9.16b, v28.16b, v0.16b\n"
-      "and v10.16b, v29.16b, v0.16b\n"
-      "and v4.16b, v30.16b, v0.16b\n"
-      "and v5.16b, v31.16b, v0.16b\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
-      "sqadd v28.4s, v28.4s, v9.4s\n"
-      "sqadd v29.4s, v29.4s, v10.4s\n"
-      "sqadd v30.4s, v30.4s, v4.4s\n"
-      "sqadd v31.4s, v31.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v7.4s\n"
+      "sqadd v19.4s, v19.4s, v6.4s\n"
+      "sqadd v20.4s, v20.4s, v5.4s\n"
+      "sqadd v21.4s, v21.4s, v4.4s\n"
+      "sqadd v22.4s, v22.4s, v3.4s\n"
+      "sqadd v23.4s, v23.4s, v2.4s\n"
+      "sqadd v24.4s, v24.4s, v1.4s\n"
+      "and v7.16b, v25.16b, v0.16b\n"
+      "and v6.16b, v26.16b, v0.16b\n"
+      "and v5.16b, v27.16b, v0.16b\n"
+      "and v4.16b, v28.16b, v0.16b\n"
+      "and v3.16b, v29.16b, v0.16b\n"
+      "and v2.16b, v30.16b, v0.16b\n"
+      "and v1.16b, v31.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v7.4s\n"
+      "sqadd v26.4s, v26.4s, v6.4s\n"
+      "sqadd v27.4s, v27.4s, v5.4s\n"
+      "sqadd v28.4s, v28.4s, v4.4s\n"
+      "sqadd v29.4s, v29.4s, v3.4s\n"
+      "sqadd v30.4s, v30.4s, v2.4s\n"
+      "sqadd v31.4s, v31.4s, v1.4s\n"
       "110:"  // Height 4: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v3.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v2.4s }, [x20]\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1845,163 +1844,163 @@
       "srshl v29.4s, v29.4s, v0.4s\n"
       "srshl v30.4s, v30.4s, v0.4s\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v3.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v3.4s\n"
+      "add v22.4s, v22.4s, v3.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v3.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v3.4s\n"
+      "add v30.4s, v30.4s, v3.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "smin v16.4s, v16.4s, v2.4s\n"
+      "smin v17.4s, v17.4s, v2.4s\n"
+      "smin v18.4s, v18.4s, v2.4s\n"
+      "smin v19.4s, v19.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v2.4s\n"
+      "smin v21.4s, v21.4s, v2.4s\n"
+      "smin v22.4s, v22.4s, v2.4s\n"
+      "smin v23.4s, v23.4s, v2.4s\n"
+      "smin v24.4s, v24.4s, v2.4s\n"
+      "smin v25.4s, v25.4s, v2.4s\n"
+      "smin v26.4s, v26.4s, v2.4s\n"
+      "smin v27.4s, v27.4s, v2.4s\n"
+      "smin v28.4s, v28.4s, v2.4s\n"
+      "smin v29.4s, v29.4s, v2.4s\n"
+      "smin v30.4s, v30.4s, v2.4s\n"
+      "smin v31.4s, v31.4s, v2.4s\n"
+      "smax v16.4s, v16.4s, v1.4s\n"
+      "smax v17.4s, v17.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v1.4s\n"
+      "smax v19.4s, v19.4s, v1.4s\n"
+      "smax v20.4s, v20.4s, v1.4s\n"
+      "smax v21.4s, v21.4s, v1.4s\n"
+      "smax v22.4s, v22.4s, v1.4s\n"
+      "smax v23.4s, v23.4s, v1.4s\n"
+      "smax v24.4s, v24.4s, v1.4s\n"
+      "smax v25.4s, v25.4s, v1.4s\n"
+      "smax v26.4s, v26.4s, v1.4s\n"
+      "smax v27.4s, v27.4s, v1.4s\n"
+      "smax v28.4s, v28.4s, v1.4s\n"
+      "smax v29.4s, v29.4s, v1.4s\n"
+      "smax v30.4s, v30.4s, v1.4s\n"
+      "smax v31.4s, v31.4s, v1.4s\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v0.8h, v18.8h, v19.8h\n"
       "uzp1 v20.8h, v20.8h, v21.8h\n"
-      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v19.8h, v22.8h, v23.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v18.8h, v26.8h, v27.8h\n"
       "uzp1 v28.8h, v28.8h, v29.8h\n"
-      "uzp1 v29.8h, v30.8h, v31.8h\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v20.16b, v20.16b, v21.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
-      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "uzp1 v17.8h, v30.8h, v31.8h\n"
+      "uzp1 v16.16b, v16.16b, v0.16b\n"
+      "uzp1 v20.16b, v20.16b, v19.16b\n"
+      "uzp1 v24.16b, v24.16b, v18.16b\n"
+      "uzp1 v28.16b, v28.16b, v17.16b\n"
       "bge 119f\n"
       "tbz x9, #3, 114f\n"
       "str d16, [x27], #0x8\n"
-      "str d20, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
-      "str d28, [x20], #0x8\n"
+      "str d20, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
+      "str d28, [x21], #0x8\n"
       "tbz x9, #2, 112f\n"
       "st1 { v16.s }[2], [x27], #0x4\n"
-      "st1 { v20.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
-      "st1 { v28.s }[2], [x20], #0x4\n"
+      "st1 { v20.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
       "tbz x9, #1, 111f\n"
       "st1 { v16.h }[6], [x27], #0x2\n"
-      "st1 { v20.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
-      "st1 { v28.h }[6], [x20], #0x2\n"
+      "st1 { v20.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[14], [x27]\n"
-      "st1 { v20.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
-      "st1 { v28.b }[14], [x20]\n"
+      "st1 { v20.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
+      "st1 { v28.b }[14], [x21]\n"
       "b 118f\n"
       "111:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[12], [x27]\n"
-      "st1 { v20.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
-      "st1 { v28.b }[12], [x20]\n"
+      "st1 { v20.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
+      "st1 { v28.b }[12], [x21]\n"
       "b 118f\n"
       "112:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x9, #1, 113f\n"
       "st1 { v16.h }[4], [x27], #0x2\n"
-      "st1 { v20.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
-      "st1 { v28.h }[4], [x20], #0x2\n"
+      "st1 { v20.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[10], [x27]\n"
-      "st1 { v20.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
-      "st1 { v28.b }[10], [x20]\n"
+      "st1 { v20.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
+      "st1 { v28.b }[10], [x21]\n"
       "b 118f\n"
       "113:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[8], [x27]\n"
-      "st1 { v20.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
-      "st1 { v28.b }[8], [x20]\n"
+      "st1 { v20.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
+      "st1 { v28.b }[8], [x21]\n"
       "b 118f\n"
       "114:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x9, #2, 116f\n"
       "str s16, [x27], #0x4\n"
-      "str s20, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
-      "str s28, [x20], #0x4\n"
+      "str s20, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
+      "str s28, [x21], #0x4\n"
       "tbz x9, #1, 115f\n"
       "st1 { v16.h }[2], [x27], #0x2\n"
-      "st1 { v20.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
-      "st1 { v28.h }[2], [x20], #0x2\n"
+      "st1 { v20.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[6], [x27]\n"
-      "st1 { v20.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
-      "st1 { v28.b }[6], [x20]\n"
+      "st1 { v20.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
+      "st1 { v28.b }[6], [x21]\n"
       "b 118f\n"
       "115:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[4], [x27]\n"
-      "st1 { v20.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
-      "st1 { v28.b }[4], [x20]\n"
+      "st1 { v20.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
+      "st1 { v28.b }[4], [x21]\n"
       "b 118f\n"
       "116:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x9, #1, 117f\n"
       "str h16, [x27], #0x2\n"
-      "str h20, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
-      "str h28, [x20], #0x2\n"
+      "str h20, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
+      "str h28, [x21], #0x2\n"
       "tbz x9, #0, 118f\n"
       "st1 { v16.b }[2], [x27]\n"
-      "st1 { v20.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
-      "st1 { v28.b }[2], [x20]\n"
+      "st1 { v20.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
+      "st1 { v28.b }[2], [x21]\n"
       "b 118f\n"
       "117:"  // Height 4: Partial direct writeback: partial_1_0
       "str b16, [x27, #0x0]\n"
-      "str b20, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
-      "str b28, [x20, #0x0]\n"
+      "str b20, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
       "118:"  // Height 4: Partial direct writeback: Done
       "b 120f\n"
       "119:"  // Height 4: Full writeback
       "str q16, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q20, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
-      "str q28, [x20, #0x0]\n"
+      "str q20, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
       "120:"  // Height 4: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 92b\n"
@@ -2017,7 +2016,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "122:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
index 8a47701..17e7405 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -81,7 +81,7 @@
                 case CPUModel::A510:
                     return { 28.00 };
                 case CPUModel::V1:
-                    return { 68.98 };
+                    return { 62.26 };
             }
         }
 
@@ -98,5 +98,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
index f808cb1..1335b35 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
@@ -78,7 +78,6 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x4\n"
       "bge 97f\n"
@@ -106,11 +105,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -131,35 +130,35 @@
       "ldr q4, [x28, #0x60]\n"
       "blt 9f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v0.2d, v1.2d, v27.2d\n"
       ".inst 0x6e85a410  // ummla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
+      "ldr q25, [x28, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v27.2d\n"
       ".inst 0x6e86a414  // ummla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x6e88a415  // ummla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x6e89a412  // ummla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x6e8aa416  // ummla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x6e84a413  // ummla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x6e85a417  // ummla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
-      ".inst 0x6e87a434  // ummla v20.4s, v1.16b, v7.16b\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x6e99a417  // ummla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x6e98a430  // ummla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
+      ".inst 0x6e9ea434  // ummla v20.4s, v1.16b, v30.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e88a431  // ummla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x6e9da431  // ummla v17.4s, v1.16b, v29.16b\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6e89a435  // ummla v21.4s, v1.16b, v9.16b\n"
-      ".inst 0x6e8aa432  // ummla v18.4s, v1.16b, v10.16b\n"
-      ".inst 0x6e84a436  // ummla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x6e86a437  // ummla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9ca435  // ummla v21.4s, v1.16b, v28.16b\n"
+      ".inst 0x6e9ba432  // ummla v18.4s, v1.16b, v27.16b\n"
+      ".inst 0x6e9aa436  // ummla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e99a433  // ummla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e98a437  // ummla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 8f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942b  // udot v11.4s, v1.16b, v15.16b\n"
@@ -177,36 +176,36 @@
       "prfm pldl1keep, [x24, #0x80]\n"
       "bge 7b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v0.2d, v1.2d, v24.2d\n"
       ".inst 0x6e85a410  // ummla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
+      "ldr q25, [x28, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v24.2d\n"
       ".inst 0x6e86a414  // ummla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x6e88a415  // ummla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x6e89a412  // ummla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x6e8aa416  // ummla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x6e84a413  // ummla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x6e85a417  // ummla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x6e99a417  // ummla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x6e98a430  // ummla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
       "sub x25, x25, #0x10\n"
-      ".inst 0x6e87a434  // ummla v20.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e88a431  // ummla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x6e9ea434  // ummla v20.4s, v1.16b, v30.16b\n"
+      ".inst 0x6e9da431  // ummla v17.4s, v1.16b, v29.16b\n"
       "add x24, x24, #0x10\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6e89a435  // ummla v21.4s, v1.16b, v9.16b\n"
-      ".inst 0x6e8aa432  // ummla v18.4s, v1.16b, v10.16b\n"
-      ".inst 0x6e84a436  // ummla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x6e86a437  // ummla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9ca435  // ummla v21.4s, v1.16b, v28.16b\n"
+      ".inst 0x6e9ba432  // ummla v18.4s, v1.16b, v27.16b\n"
+      ".inst 0x6e9aa436  // ummla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e99a433  // ummla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e98a437  // ummla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 10f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942b  // udot v11.4s, v1.16b, v15.16b\n"
@@ -217,29 +216,29 @@
       "cmp x25, #0x8\n"
       "blt 14f\n"
       "12:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x24], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "trn1 v0.2d, v25.2d, v24.2d\n"
       "tbnz %x[flags], #31, 13f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "13:"  // Height 1: Multiply loop: unique 3: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x6e88a410  // ummla v16.4s, v0.16b, v8.16b\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      ".inst 0x6e98a410  // ummla v16.4s, v0.16b, v24.16b\n"
       "sub x25, x25, #0x8\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
       "cmp x25, #0x8\n"
-      ".inst 0x6e89a414  // ummla v20.4s, v0.16b, v9.16b\n"
-      "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x6e8aa411  // ummla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e84a415  // ummla v21.4s, v0.16b, v4.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x6e85a412  // ummla v18.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e86a416  // ummla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a413  // ummla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e88a417  // ummla v23.4s, v0.16b, v8.16b\n"
+      ".inst 0x6e9aa414  // ummla v20.4s, v0.16b, v26.16b\n"
+      "ldr q27, [x28, #0x40]\n"
+      "ldr q26, [x28, #0x50]\n"
+      ".inst 0x6e99a411  // ummla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a415  // ummla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x6e9ba412  // ummla v18.4s, v0.16b, v27.16b\n"
+      ".inst 0x6e9aa416  // ummla v22.4s, v0.16b, v26.16b\n"
+      ".inst 0x6e99a413  // ummla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a417  // ummla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "bge 12b\n"
       "14:"  // Height 1: Multiply loop: Skip odd blocks
@@ -264,26 +263,26 @@
       "17:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b1, [x24, #0x0]\n"
       "18:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v0.2d, v1.2d, v24.2d\n"
       "tbnz %x[flags], #31, 19f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "19:"  // Height 1: Multiply loop: unique 4: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6e8aa410  // ummla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e84a414  // ummla v20.4s, v0.16b, v4.16b\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6e85a411  // ummla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e86a415  // ummla v21.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x6e87a412  // ummla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e88a416  // ummla v22.4s, v0.16b, v8.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
-      ".inst 0x6e89a413  // ummla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x6e8aa417  // ummla v23.4s, v0.16b, v10.16b\n"
+      "ldr q25, [x28, #0x0]\n"
+      "ldr q24, [x28, #0x10]\n"
+      ".inst 0x6e99a410  // ummla v16.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a414  // ummla v20.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x6e99a411  // ummla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a415  // ummla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x40]\n"
+      "ldr q24, [x28, #0x50]\n"
+      ".inst 0x6e99a412  // ummla v18.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a416  // ummla v22.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x6e99a413  // ummla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a417  // ummla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "20:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -297,75 +296,75 @@
       "uzp1 v19.2d, v19.2d, v23.2d\n"
       "mov v23.16b, v16.16b\n"
       "tbnz %x[flags], #31, 21f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v1.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v16.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "neg v1.4s, v1.4s\n"
+      "neg v16.4s, v16.4s\n"
       "dup v11.4s, v11.s[0]\n"
-      "mul v11.4s, v11.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v16.4s\n"
       "21:"  // Height 1: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q24, [x10, #0x0]\n"
+      "ldr q22, [x10, #0x10]\n"
       "add v23.4s, v23.4s, v11.4s\n"
       "add v17.4s, v17.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q21, [x10, #0x20]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add v18.4s, v18.4s, v11.4s\n"
       "add v19.4s, v19.4s, v11.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v16.4s }, [x20]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v24.4s\n"
+      "add v17.4s, v17.4s, v22.4s\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v19.4s, v19.4s, v20.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v16.4s\n"
       "add x10, x10, #0x40\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v16.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v16.4s\n"
       "tbz %x[flags], #5, 22f\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "and v5.16b, v17.16b, v0.16b\n"
-      "and v6.16b, v18.16b, v0.16b\n"
-      "and v7.16b, v19.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "sqadd v17.4s, v17.4s, v5.4s\n"
-      "sqadd v18.4s, v18.4s, v6.4s\n"
-      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v22.16b, v23.16b, v0.16b\n"
+      "and v21.16b, v17.16b, v0.16b\n"
+      "and v20.16b, v18.16b, v0.16b\n"
+      "and v16.16b, v19.16b, v0.16b\n"
+      "sshr v22.4s, v22.4s, #0x1f\n"
+      "sshr v21.4s, v21.4s, #0x1f\n"
+      "sshr v20.4s, v20.4s, #0x1f\n"
+      "sshr v16.4s, v16.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v22.4s\n"
+      "sqadd v17.4s, v17.4s, v21.4s\n"
+      "sqadd v18.4s, v18.4s, v20.4s\n"
+      "sqadd v19.4s, v19.4s, v16.4s\n"
       "22:"  // Height 1: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v21.4s }, [x20]\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v20.4s }, [x20]\n"
+      "add v23.4s, v23.4s, v21.4s\n"
+      "add v17.4s, v17.4s, v21.4s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v16.4s }, [x20]\n"
+      "add v18.4s, v18.4s, v21.4s\n"
+      "add v19.4s, v19.4s, v21.4s\n"
       "cmp x9, #0x10\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "smin v23.4s, v23.4s, v20.4s\n"
+      "smin v17.4s, v17.4s, v20.4s\n"
+      "smin v18.4s, v18.4s, v20.4s\n"
+      "smin v19.4s, v19.4s, v20.4s\n"
+      "smax v23.4s, v23.4s, v16.4s\n"
+      "smax v17.4s, v17.4s, v16.4s\n"
+      "smax v18.4s, v18.4s, v16.4s\n"
+      "smax v19.4s, v19.4s, v16.4s\n"
       "uzp1 v23.8h, v23.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
-      "uzp1 v23.16b, v23.16b, v17.16b\n"
+      "uzp1 v16.8h, v18.8h, v19.8h\n"
+      "uzp1 v23.16b, v23.16b, v16.16b\n"
       "bge 31f\n"
       "tbz x9, #3, 26f\n"
       "str d23, [x27], #0x8\n"
@@ -442,12 +441,12 @@
       "36:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 37f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 38f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -455,7 +454,7 @@
       "b 38f\n"
       "37:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "38:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "blt 43f\n"
@@ -473,34 +472,34 @@
       "39:"  // Height 2: Multiply loop: Main loop head
       "trn1 v0.2d, v1.2d, v2.2d\n"
       ".inst 0x6e85a410  // ummla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       ".inst 0x6e86a414  // ummla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
       ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x6e88a415  // ummla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x6e89a412  // ummla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x6e8aa416  // ummla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x6e84a413  // ummla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x6e85a417  // ummla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
-      ".inst 0x6e87a434  // ummla v20.4s, v1.16b, v7.16b\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x6e99a417  // ummla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x6e98a430  // ummla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
+      ".inst 0x6e9ea434  // ummla v20.4s, v1.16b, v30.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e88a431  // ummla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x6e9da431  // ummla v17.4s, v1.16b, v29.16b\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e89a435  // ummla v21.4s, v1.16b, v9.16b\n"
+      ".inst 0x6e9ca435  // ummla v21.4s, v1.16b, v28.16b\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6e8aa432  // ummla v18.4s, v1.16b, v10.16b\n"
-      ".inst 0x6e84a436  // ummla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x6e86a437  // ummla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9ba432  // ummla v18.4s, v1.16b, v27.16b\n"
+      ".inst 0x6e9aa436  // ummla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e99a433  // ummla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e98a437  // ummla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 40f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942b  // udot v11.4s, v1.16b, v15.16b\n"
@@ -522,35 +521,35 @@
       "41:"  // Height 2: Multiply loop: Single iteration only
       "trn1 v0.2d, v1.2d, v2.2d\n"
       ".inst 0x6e85a410  // ummla v16.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q25, [x28, #0x70]\n"
       ".inst 0x6e86a414  // ummla v20.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q24, [x28, #0x80]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
       ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x28, #0x90]\n"
+      "ldr q30, [x28, #0x90]\n"
       ".inst 0x6e88a415  // ummla v21.4s, v0.16b, v8.16b\n"
-      "ldr q8, [x28, #0xa0]\n"
+      "ldr q29, [x28, #0xa0]\n"
       ".inst 0x6e89a412  // ummla v18.4s, v0.16b, v9.16b\n"
-      "ldr q9, [x28, #0xb0]\n"
+      "ldr q28, [x28, #0xb0]\n"
       ".inst 0x6e8aa416  // ummla v22.4s, v0.16b, v10.16b\n"
-      "ldr q10, [x28, #0xc0]\n"
+      "ldr q27, [x28, #0xc0]\n"
       ".inst 0x6e84a413  // ummla v19.4s, v0.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x6e85a417  // ummla v23.4s, v0.16b, v5.16b\n"
-      "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      "ldr q26, [x28, #0xd0]\n"
+      ".inst 0x6e99a417  // ummla v23.4s, v0.16b, v25.16b\n"
+      "ldr q25, [x28, #0xe0]\n"
+      ".inst 0x6e98a430  // ummla v16.4s, v1.16b, v24.16b\n"
+      "ldr q24, [x28, #0xf0]\n"
       "sub x25, x25, #0x10\n"
-      ".inst 0x6e87a434  // ummla v20.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e88a431  // ummla v17.4s, v1.16b, v8.16b\n"
+      ".inst 0x6e9ea434  // ummla v20.4s, v1.16b, v30.16b\n"
+      ".inst 0x6e9da431  // ummla v17.4s, v1.16b, v29.16b\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e89a435  // ummla v21.4s, v1.16b, v9.16b\n"
-      ".inst 0x6e8aa432  // ummla v18.4s, v1.16b, v10.16b\n"
+      ".inst 0x6e9ca435  // ummla v21.4s, v1.16b, v28.16b\n"
+      ".inst 0x6e9ba432  // ummla v18.4s, v1.16b, v27.16b\n"
       "add x28, x28, #0x100\n"
-      ".inst 0x6e84a436  // ummla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x6e86a437  // ummla v23.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa436  // ummla v22.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e99a433  // ummla v19.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e98a437  // ummla v23.4s, v1.16b, v24.16b\n"
       "tbnz %x[flags], #31, 42f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f942b  // udot v11.4s, v1.16b, v15.16b\n"
@@ -562,30 +561,30 @@
       "cmp x25, #0x8\n"
       "blt 46f\n"
       "44:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x24], #0x8\n"
-      "ldr d2, [x23], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "trn1 v0.2d, v25.2d, v24.2d\n"
       "tbnz %x[flags], #31, 45f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "45:"  // Height 2: Multiply loop: unique 7: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x6e88a410  // ummla v16.4s, v0.16b, v8.16b\n"
+      "ldr q24, [x28, #0x0]\n"
+      "ldr q26, [x28, #0x10]\n"
+      ".inst 0x6e98a410  // ummla v16.4s, v0.16b, v24.16b\n"
       "sub x25, x25, #0x8\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
       "cmp x25, #0x8\n"
-      ".inst 0x6e89a414  // ummla v20.4s, v0.16b, v9.16b\n"
-      "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x6e8aa411  // ummla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e84a415  // ummla v21.4s, v0.16b, v4.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x6e85a412  // ummla v18.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e86a416  // ummla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a413  // ummla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e88a417  // ummla v23.4s, v0.16b, v8.16b\n"
+      ".inst 0x6e9aa414  // ummla v20.4s, v0.16b, v26.16b\n"
+      "ldr q27, [x28, #0x40]\n"
+      "ldr q26, [x28, #0x50]\n"
+      ".inst 0x6e99a411  // ummla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a415  // ummla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x6e9ba412  // ummla v18.4s, v0.16b, v27.16b\n"
+      ".inst 0x6e9aa416  // ummla v22.4s, v0.16b, v26.16b\n"
+      ".inst 0x6e99a413  // ummla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a417  // ummla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "bge 44b\n"
       "46:"  // Height 2: Multiply loop: Skip odd blocks
@@ -621,22 +620,22 @@
       "tbnz %x[flags], #31, 51f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       "51:"  // Height 2: Multiply loop: unique 8: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6e8aa410  // ummla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e84a414  // ummla v20.4s, v0.16b, v4.16b\n"
-      "ldr q5, [x28, #0x20]\n"
-      "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6e85a411  // ummla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e86a415  // ummla v21.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x6e87a412  // ummla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e88a416  // ummla v22.4s, v0.16b, v8.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
-      ".inst 0x6e89a413  // ummla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x6e8aa417  // ummla v23.4s, v0.16b, v10.16b\n"
+      "ldr q25, [x28, #0x0]\n"
+      "ldr q24, [x28, #0x10]\n"
+      ".inst 0x6e99a410  // ummla v16.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a414  // ummla v20.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x20]\n"
+      "ldr q24, [x28, #0x30]\n"
+      ".inst 0x6e99a411  // ummla v17.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a415  // ummla v21.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x40]\n"
+      "ldr q24, [x28, #0x50]\n"
+      ".inst 0x6e99a412  // ummla v18.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a416  // ummla v22.4s, v0.16b, v24.16b\n"
+      "ldr q25, [x28, #0x60]\n"
+      "ldr q24, [x28, #0x70]\n"
+      ".inst 0x6e99a413  // ummla v19.4s, v0.16b, v25.16b\n"
+      ".inst 0x6e98a417  // ummla v23.4s, v0.16b, v24.16b\n"
       "add x28, x28, #0x80\n"
       "52:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -644,127 +643,127 @@
       "cmp x26, x20\n"
       "bne 36b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 v4.2d, v16.2d, v20.2d\n"
-      "add x22, x27, x20\n"
+      "uzp1 v24.2d, v16.2d, v20.2d\n"
+      "add x23, x27, x20\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
-      "mov v23.16b, v4.16b\n"
+      "mov v23.16b, v24.16b\n"
       "tbnz %x[flags], #31, 53f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v2.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
-      "neg v2.4s, v2.4s\n"
+      "neg v24.4s, v24.4s\n"
       "dup v12.4s, v11.s[3]\n"
       "dup v11.4s, v11.s[0]\n"
-      "mul v11.4s, v11.4s, v2.4s\n"
-      "mul v12.4s, v12.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v24.4s\n"
+      "mul v12.4s, v12.4s, v24.4s\n"
       "53:"  // Height 2: skip row sum fixup
-      "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q28, [x10, #0x0]\n"
+      "ldr q27, [x10, #0x10]\n"
       "add v23.4s, v23.4s, v11.4s\n"
       "add v20.4s, v20.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q26, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x30]\n"
       "add v21.4s, v21.4s, v11.4s\n"
       "add v22.4s, v22.4s, v11.4s\n"
       "add v16.4s, v16.4s, v12.4s\n"
       "add v17.4s, v17.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "add v18.4s, v18.4s, v12.4s\n"
       "add v19.4s, v19.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "add v23.4s, v23.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v23.4s, v23.4s, v28.4s\n"
+      "add v20.4s, v20.4s, v27.4s\n"
       "add x10, x10, #0x40\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
-      "add v16.4s, v16.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v25.4s\n"
+      "add v16.4s, v16.4s, v28.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v17.4s, v17.4s, v27.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v25.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v24.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v24.4s\n"
       "tbz %x[flags], #5, 54f\n"
-      "and v4.16b, v23.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v23.4s, v23.4s, v4.4s\n"
-      "and v5.16b, v20.16b, v0.16b\n"
-      "and v6.16b, v21.16b, v0.16b\n"
-      "and v7.16b, v22.16b, v0.16b\n"
-      "and v8.16b, v16.16b, v0.16b\n"
-      "and v9.16b, v17.16b, v0.16b\n"
-      "and v10.16b, v18.16b, v0.16b\n"
-      "and v4.16b, v19.16b, v0.16b\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "sqadd v16.4s, v16.4s, v8.4s\n"
-      "sqadd v17.4s, v17.4s, v9.4s\n"
-      "sqadd v18.4s, v18.4s, v10.4s\n"
-      "sqadd v19.4s, v19.4s, v4.4s\n"
+      "and v24.16b, v23.16b, v0.16b\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v24.4s\n"
+      "and v30.16b, v20.16b, v0.16b\n"
+      "and v29.16b, v21.16b, v0.16b\n"
+      "and v28.16b, v22.16b, v0.16b\n"
+      "and v27.16b, v16.16b, v0.16b\n"
+      "and v26.16b, v17.16b, v0.16b\n"
+      "and v25.16b, v18.16b, v0.16b\n"
+      "and v24.16b, v19.16b, v0.16b\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v27.4s, v27.4s, #0x1f\n"
+      "sshr v26.4s, v26.4s, #0x1f\n"
+      "sshr v25.4s, v25.4s, #0x1f\n"
+      "sshr v24.4s, v24.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v30.4s\n"
+      "sqadd v21.4s, v21.4s, v29.4s\n"
+      "sqadd v22.4s, v22.4s, v28.4s\n"
+      "sqadd v16.4s, v16.4s, v27.4s\n"
+      "sqadd v17.4s, v17.4s, v26.4s\n"
+      "sqadd v18.4s, v18.4s, v25.4s\n"
+      "sqadd v19.4s, v19.4s, v24.4s\n"
       "54:"  // Height 2: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v26.4s }, [x20]\n"
       "srshl v23.4s, v23.4s, v0.4s\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v25.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v24.4s }, [x20]\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
       "cmp x9, #0x10\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v23.4s, v23.4s, v26.4s\n"
+      "add v20.4s, v20.4s, v26.4s\n"
+      "add v21.4s, v21.4s, v26.4s\n"
+      "add v22.4s, v22.4s, v26.4s\n"
+      "add v16.4s, v16.4s, v26.4s\n"
+      "add v17.4s, v17.4s, v26.4s\n"
+      "add v18.4s, v18.4s, v26.4s\n"
+      "add v19.4s, v19.4s, v26.4s\n"
+      "smin v23.4s, v23.4s, v25.4s\n"
+      "smin v20.4s, v20.4s, v25.4s\n"
+      "smin v21.4s, v21.4s, v25.4s\n"
+      "smin v22.4s, v22.4s, v25.4s\n"
+      "smin v16.4s, v16.4s, v25.4s\n"
+      "smin v17.4s, v17.4s, v25.4s\n"
+      "smin v18.4s, v18.4s, v25.4s\n"
+      "smin v19.4s, v19.4s, v25.4s\n"
+      "smax v23.4s, v23.4s, v24.4s\n"
+      "smax v20.4s, v20.4s, v24.4s\n"
+      "smax v21.4s, v21.4s, v24.4s\n"
+      "smax v22.4s, v22.4s, v24.4s\n"
+      "smax v16.4s, v16.4s, v24.4s\n"
+      "smax v17.4s, v17.4s, v24.4s\n"
+      "smax v18.4s, v18.4s, v24.4s\n"
+      "smax v19.4s, v19.4s, v24.4s\n"
       "uzp1 v23.8h, v23.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
@@ -774,68 +773,68 @@
       "bge 63f\n"
       "tbz x9, #3, 58f\n"
       "str d23, [x27], #0x8\n"
-      "str d16, [x22], #0x8\n"
+      "str d16, [x23], #0x8\n"
       "tbz x9, #2, 56f\n"
       "st1 { v23.s }[2], [x27], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
       "tbz x9, #1, 55f\n"
       "st1 { v23.h }[6], [x27], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[14], [x27]\n"
-      "st1 { v16.b }[14], [x22]\n"
+      "st1 { v16.b }[14], [x23]\n"
       "b 62f\n"
       "55:"  // Height 2: Partial direct writeback: partial_1_12
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[12], [x27]\n"
-      "st1 { v16.b }[12], [x22]\n"
+      "st1 { v16.b }[12], [x23]\n"
       "b 62f\n"
       "56:"  // Height 2: Partial direct writeback: partial_2_8
       "tbz x9, #1, 57f\n"
       "st1 { v23.h }[4], [x27], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[10], [x27]\n"
-      "st1 { v16.b }[10], [x22]\n"
+      "st1 { v16.b }[10], [x23]\n"
       "b 62f\n"
       "57:"  // Height 2: Partial direct writeback: partial_1_8
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[8], [x27]\n"
-      "st1 { v16.b }[8], [x22]\n"
+      "st1 { v16.b }[8], [x23]\n"
       "b 62f\n"
       "58:"  // Height 2: Partial direct writeback: partial_4_0
       "tbz x9, #2, 60f\n"
       "str s23, [x27], #0x4\n"
-      "str s16, [x22], #0x4\n"
+      "str s16, [x23], #0x4\n"
       "tbz x9, #1, 59f\n"
       "st1 { v23.h }[2], [x27], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[6], [x27]\n"
-      "st1 { v16.b }[6], [x22]\n"
+      "st1 { v16.b }[6], [x23]\n"
       "b 62f\n"
       "59:"  // Height 2: Partial direct writeback: partial_1_4
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[4], [x27]\n"
-      "st1 { v16.b }[4], [x22]\n"
+      "st1 { v16.b }[4], [x23]\n"
       "b 62f\n"
       "60:"  // Height 2: Partial direct writeback: partial_2_0
       "tbz x9, #1, 61f\n"
       "str h23, [x27], #0x2\n"
-      "str h16, [x22], #0x2\n"
+      "str h16, [x23], #0x2\n"
       "tbz x9, #0, 62f\n"
       "st1 { v23.b }[2], [x27]\n"
-      "st1 { v16.b }[2], [x22]\n"
+      "st1 { v16.b }[2], [x23]\n"
       "b 62f\n"
       "61:"  // Height 2: Partial direct writeback: partial_1_0
       "str b23, [x27, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
       "62:"  // Height 2: Partial direct writeback: Done
       "b 64f\n"
       "63:"  // Height 2: Full writeback
       "str q23, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q16, [x22, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
       "64:"  // Height 2: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 34b\n"
@@ -872,13 +871,13 @@
       "68:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 69f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 70f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -887,8 +886,8 @@
       "b 70f\n"
       "69:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "70:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "blt 75f\n"
@@ -909,12 +908,12 @@
       ".inst 0x6e85a410  // ummla v16.4s, v0.16b, v5.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q14, [x28, #0x70]\n"
       ".inst 0x6e86a414  // ummla v20.4s, v0.16b, v6.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      "ldr q4, [x28, #0x60]\n"
+      "ldr q5, [x28, #0x60]\n"
       ".inst 0x6e86a45c  // ummla v28.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q4, [x28, #0x80]\n"
       ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
       ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
       "ldr q7, [x28, #0x90]\n"
@@ -930,15 +929,15 @@
       ".inst 0x6e8aa416  // ummla v22.4s, v0.16b, v10.16b\n"
       ".inst 0x6e8aa45e  // ummla v30.4s, v2.16b, v10.16b\n"
       "ldr q10, [x28, #0xc0]\n"
-      ".inst 0x6e84a413  // ummla v19.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e84a45b  // ummla v27.4s, v2.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x6e85a417  // ummla v23.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e85a45f  // ummla v31.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e85a413  // ummla v19.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e85a45b  // ummla v27.4s, v2.16b, v5.16b\n"
+      "ldr q6, [x28, #0xd0]\n"
+      ".inst 0x6e8ea417  // ummla v23.4s, v0.16b, v14.16b\n"
+      ".inst 0x6e8ea45f  // ummla v31.4s, v2.16b, v14.16b\n"
       "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a478  // ummla v24.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      ".inst 0x6e84a430  // ummla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e84a478  // ummla v24.4s, v3.16b, v4.16b\n"
+      "ldr q4, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
       ".inst 0x6e87a434  // ummla v20.4s, v1.16b, v7.16b\n"
       ".inst 0x6e87a47c  // ummla v28.4s, v3.16b, v7.16b\n"
@@ -948,12 +947,12 @@
       ".inst 0x6e89a47d  // ummla v29.4s, v3.16b, v9.16b\n"
       ".inst 0x6e8aa432  // ummla v18.4s, v1.16b, v10.16b\n"
       ".inst 0x6e8aa47a  // ummla v26.4s, v3.16b, v10.16b\n"
-      ".inst 0x6e84a436  // ummla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e84a47e  // ummla v30.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e86a436  // ummla v22.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a47e  // ummla v30.4s, v3.16b, v6.16b\n"
       ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
       ".inst 0x6e85a47b  // ummla v27.4s, v3.16b, v5.16b\n"
-      ".inst 0x6e86a437  // ummla v23.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a47f  // ummla v31.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e84a437  // ummla v23.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e84a47f  // ummla v31.4s, v3.16b, v4.16b\n"
       "tbnz %x[flags], #31, 72f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
@@ -981,12 +980,12 @@
       ".inst 0x6e85a410  // ummla v16.4s, v0.16b, v5.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
-      "ldr q5, [x28, #0x70]\n"
+      "ldr q14, [x28, #0x70]\n"
       ".inst 0x6e86a414  // ummla v20.4s, v0.16b, v6.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      "ldr q4, [x28, #0x60]\n"
+      "ldr q5, [x28, #0x60]\n"
       ".inst 0x6e86a45c  // ummla v28.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x28, #0x80]\n"
+      "ldr q4, [x28, #0x80]\n"
       ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
       ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
       "ldr q7, [x28, #0x90]\n"
@@ -1003,15 +1002,15 @@
       ".inst 0x6e8aa45e  // ummla v30.4s, v2.16b, v10.16b\n"
       "ldr q10, [x28, #0xc0]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6e84a413  // ummla v19.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e84a45b  // ummla v27.4s, v2.16b, v4.16b\n"
-      "ldr q4, [x28, #0xd0]\n"
-      ".inst 0x6e85a417  // ummla v23.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e85a45f  // ummla v31.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e85a413  // ummla v19.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e85a45b  // ummla v27.4s, v2.16b, v5.16b\n"
+      "ldr q6, [x28, #0xd0]\n"
+      ".inst 0x6e8ea417  // ummla v23.4s, v0.16b, v14.16b\n"
+      ".inst 0x6e8ea45f  // ummla v31.4s, v2.16b, v14.16b\n"
       "ldr q5, [x28, #0xe0]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a478  // ummla v24.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x28, #0xf0]\n"
+      ".inst 0x6e84a430  // ummla v16.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e84a478  // ummla v24.4s, v3.16b, v4.16b\n"
+      "ldr q4, [x28, #0xf0]\n"
       "add x28, x28, #0x100\n"
       ".inst 0x6e87a434  // ummla v20.4s, v1.16b, v7.16b\n"
       ".inst 0x6e87a47c  // ummla v28.4s, v3.16b, v7.16b\n"
@@ -1021,12 +1020,12 @@
       ".inst 0x6e89a47d  // ummla v29.4s, v3.16b, v9.16b\n"
       ".inst 0x6e8aa432  // ummla v18.4s, v1.16b, v10.16b\n"
       ".inst 0x6e8aa47a  // ummla v26.4s, v3.16b, v10.16b\n"
-      ".inst 0x6e84a436  // ummla v22.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e84a47e  // ummla v30.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e86a436  // ummla v22.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a47e  // ummla v30.4s, v3.16b, v6.16b\n"
       ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
       ".inst 0x6e85a47b  // ummla v27.4s, v3.16b, v5.16b\n"
-      ".inst 0x6e86a437  // ummla v23.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a47f  // ummla v31.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e84a437  // ummla v23.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e84a47f  // ummla v31.4s, v3.16b, v4.16b\n"
       "tbnz %x[flags], #31, 74f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
@@ -1042,41 +1041,41 @@
       "blt 78f\n"
       "76:"  // Height 3: Multiply loop: Odd block loop
       "ldr d1, [x24], #0x8\n"
-      "ldr d2, [x23], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x22], #0x8\n"
-      "trn1 v2.2d, v3.2d, v7.2d\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v0.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x22], #0x8\n"
+      "trn1 v2.2d, v1.2d, v2.2d\n"
       "tbnz %x[flags], #31, 77f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "77:"  // Height 3: Multiply loop: unique 11: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x6e88a410  // ummla v16.4s, v0.16b, v8.16b\n"
-      ".inst 0x6e88a458  // ummla v24.4s, v2.16b, v8.16b\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q1, [x28, #0x10]\n"
+      ".inst 0x6e83a410  // ummla v16.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a458  // ummla v24.4s, v2.16b, v3.16b\n"
+      "ldr q7, [x28, #0x20]\n"
+      "ldr q6, [x28, #0x30]\n"
       "sub x25, x25, #0x8\n"
       "cmp x25, #0x8\n"
       "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x6e89a414  // ummla v20.4s, v0.16b, v9.16b\n"
-      ".inst 0x6e89a45c  // ummla v28.4s, v2.16b, v9.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x6e8aa411  // ummla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e8aa459  // ummla v25.4s, v2.16b, v10.16b\n"
-      ".inst 0x6e84a415  // ummla v21.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e84a45d  // ummla v29.4s, v2.16b, v4.16b\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x6e81a414  // ummla v20.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a45c  // ummla v28.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e86a415  // ummla v21.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e86a45d  // ummla v29.4s, v2.16b, v6.16b\n"
       "add x28, x28, #0x80\n"
       ".inst 0x6e85a412  // ummla v18.4s, v0.16b, v5.16b\n"
       ".inst 0x6e85a45a  // ummla v26.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e86a416  // ummla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a45e  // ummla v30.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a413  // ummla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a45b  // ummla v27.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e88a417  // ummla v23.4s, v0.16b, v8.16b\n"
-      ".inst 0x6e88a45f  // ummla v31.4s, v2.16b, v8.16b\n"
+      ".inst 0x6e84a416  // ummla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a45e  // ummla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e83a413  // ummla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a45b  // ummla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e81a417  // ummla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a45f  // ummla v31.4s, v2.16b, v1.16b\n"
       "bge 76b\n"
       "78:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x25, 84f\n"
@@ -1115,52 +1114,52 @@
       "ldr b3, [x22, #0x0]\n"
       "82:"  // Height 3: Multiply loop: Ragged operand read: Done
       "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v9.2d\n"
+      "trn1 v2.2d, v3.2d, v4.2d\n"
       "tbnz %x[flags], #31, 83f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "83:"  // Height 3: Multiply loop: unique 12: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6e8aa410  // ummla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e8aa458  // ummla v24.4s, v2.16b, v10.16b\n"
-      "ldr q5, [x28, #0x20]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q3, [x28, #0x10]\n"
+      ".inst 0x6e81a410  // ummla v16.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a458  // ummla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x28, #0x20]\n"
       "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6e84a414  // ummla v20.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e84a45c  // ummla v28.4s, v2.16b, v4.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x6e85a411  // ummla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
+      ".inst 0x6e83a414  // ummla v20.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a45c  // ummla v28.4s, v2.16b, v3.16b\n"
+      "ldr q5, [x28, #0x40]\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x6e81a411  // ummla v17.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a459  // ummla v25.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
       ".inst 0x6e86a415  // ummla v21.4s, v0.16b, v6.16b\n"
       ".inst 0x6e86a45d  // ummla v29.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a412  // ummla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a45a  // ummla v26.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e85a412  // ummla v18.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e85a45a  // ummla v26.4s, v2.16b, v5.16b\n"
       "add x28, x28, #0x80\n"
-      ".inst 0x6e88a416  // ummla v22.4s, v0.16b, v8.16b\n"
-      ".inst 0x6e88a45e  // ummla v30.4s, v2.16b, v8.16b\n"
-      ".inst 0x6e89a413  // ummla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x6e89a45b  // ummla v27.4s, v2.16b, v9.16b\n"
-      ".inst 0x6e8aa417  // ummla v23.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e8aa45f  // ummla v31.4s, v2.16b, v10.16b\n"
+      ".inst 0x6e84a416  // ummla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a45e  // ummla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e83a413  // ummla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a45b  // ummla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e81a417  // ummla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a45f  // ummla v31.4s, v2.16b, v1.16b\n"
       "84:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 68b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 v4.2d, v16.2d, v20.2d\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "uzp1 v0.2d, v16.2d, v20.2d\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "prfm pstl1keep, [x27, #0x0]\n"
-      "prfm pstl1keep, [x22, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
+      "prfm pstl1keep, [x22, #0x0]\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
@@ -1168,116 +1167,116 @@
       "uzp1 v25.2d, v25.2d, v29.2d\n"
       "uzp1 v26.2d, v26.2d, v30.2d\n"
       "uzp1 v27.2d, v27.2d, v31.2d\n"
-      "mov v31.16b, v4.16b\n"
+      "mov v31.16b, v0.16b\n"
       "tbnz %x[flags], #31, 85f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v3.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "neg v3.4s, v3.4s\n"
+      "neg v23.4s, v23.4s\n"
       "dup v12.4s, v11.s[3]\n"
       "dup v11.4s, v11.s[0]\n"
       "dup v13.4s, v13.s[0]\n"
-      "mul v11.4s, v11.4s, v3.4s\n"
-      "mul v12.4s, v12.4s, v3.4s\n"
-      "mul v13.4s, v13.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v23.4s\n"
+      "mul v12.4s, v12.4s, v23.4s\n"
+      "mul v13.4s, v13.4s, v23.4s\n"
       "85:"  // Height 3: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q30, [x10, #0x10]\n"
       "add v31.4s, v31.4s, v11.4s\n"
       "add v20.4s, v20.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q29, [x10, #0x20]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add v21.4s, v21.4s, v11.4s\n"
       "add v22.4s, v22.4s, v11.4s\n"
       "add v16.4s, v16.4s, v12.4s\n"
       "add v17.4s, v17.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "add v18.4s, v18.4s, v12.4s\n"
       "add v19.4s, v19.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v24.4s, v24.4s, v13.4s\n"
       "add v25.4s, v25.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
       "add v26.4s, v26.4s, v13.4s\n"
       "add v27.4s, v27.4s, v13.4s\n"
       "add v31.4s, v31.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v30.4s\n"
+      "add v21.4s, v21.4s, v29.4s\n"
+      "add v22.4s, v22.4s, v28.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v30.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v28.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v30.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v28.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v23.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v23.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v23.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v23.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v23.4s\n"
       "tbz %x[flags], #5, 86f\n"
-      "and v4.16b, v31.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v0.16b\n"
-      "and v6.16b, v21.16b, v0.16b\n"
-      "and v7.16b, v22.16b, v0.16b\n"
-      "and v8.16b, v16.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v31.4s, v31.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "sqadd v16.4s, v16.4s, v8.4s\n"
-      "and v9.16b, v17.16b, v0.16b\n"
-      "and v10.16b, v18.16b, v0.16b\n"
-      "and v4.16b, v19.16b, v0.16b\n"
-      "and v5.16b, v24.16b, v0.16b\n"
-      "and v6.16b, v25.16b, v0.16b\n"
-      "and v7.16b, v26.16b, v0.16b\n"
-      "and v8.16b, v27.16b, v0.16b\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sqadd v17.4s, v17.4s, v9.4s\n"
-      "sqadd v18.4s, v18.4s, v10.4s\n"
-      "sqadd v19.4s, v19.4s, v4.4s\n"
-      "sqadd v24.4s, v24.4s, v5.4s\n"
-      "sqadd v25.4s, v25.4s, v6.4s\n"
-      "sqadd v26.4s, v26.4s, v7.4s\n"
-      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "and v1.16b, v31.16b, v0.16b\n"
+      "and v30.16b, v20.16b, v0.16b\n"
+      "and v29.16b, v21.16b, v0.16b\n"
+      "and v28.16b, v22.16b, v0.16b\n"
+      "and v23.16b, v16.16b, v0.16b\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v31.4s, v31.4s, v1.4s\n"
+      "sqadd v20.4s, v20.4s, v30.4s\n"
+      "sqadd v21.4s, v21.4s, v29.4s\n"
+      "sqadd v22.4s, v22.4s, v28.4s\n"
+      "sqadd v16.4s, v16.4s, v23.4s\n"
+      "and v3.16b, v17.16b, v0.16b\n"
+      "and v2.16b, v18.16b, v0.16b\n"
+      "and v1.16b, v19.16b, v0.16b\n"
+      "and v30.16b, v24.16b, v0.16b\n"
+      "and v29.16b, v25.16b, v0.16b\n"
+      "and v28.16b, v26.16b, v0.16b\n"
+      "and v23.16b, v27.16b, v0.16b\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sshr v30.4s, v30.4s, #0x1f\n"
+      "sshr v29.4s, v29.4s, #0x1f\n"
+      "sshr v28.4s, v28.4s, #0x1f\n"
+      "sshr v23.4s, v23.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v3.4s\n"
+      "sqadd v18.4s, v18.4s, v2.4s\n"
+      "sqadd v19.4s, v19.4s, v1.4s\n"
+      "sqadd v24.4s, v24.4s, v30.4s\n"
+      "sqadd v25.4s, v25.4s, v29.4s\n"
+      "sqadd v26.4s, v26.4s, v28.4s\n"
+      "sqadd v27.4s, v27.4s, v23.4s\n"
       "86:"  // Height 3: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v29.4s }, [x20]\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v28.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v23.4s }, [x20]\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1285,132 +1284,132 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v31.4s, v31.4s, v29.4s\n"
+      "add v20.4s, v20.4s, v29.4s\n"
+      "add v21.4s, v21.4s, v29.4s\n"
+      "add v22.4s, v22.4s, v29.4s\n"
+      "add v16.4s, v16.4s, v29.4s\n"
+      "add v17.4s, v17.4s, v29.4s\n"
+      "add v18.4s, v18.4s, v29.4s\n"
+      "add v19.4s, v19.4s, v29.4s\n"
+      "add v24.4s, v24.4s, v29.4s\n"
+      "add v25.4s, v25.4s, v29.4s\n"
+      "add v26.4s, v26.4s, v29.4s\n"
+      "add v27.4s, v27.4s, v29.4s\n"
+      "smin v31.4s, v31.4s, v28.4s\n"
+      "smin v20.4s, v20.4s, v28.4s\n"
+      "smin v21.4s, v21.4s, v28.4s\n"
+      "smin v22.4s, v22.4s, v28.4s\n"
+      "smin v16.4s, v16.4s, v28.4s\n"
+      "smin v17.4s, v17.4s, v28.4s\n"
+      "smin v18.4s, v18.4s, v28.4s\n"
+      "smin v19.4s, v19.4s, v28.4s\n"
+      "smin v24.4s, v24.4s, v28.4s\n"
+      "smin v25.4s, v25.4s, v28.4s\n"
+      "smin v26.4s, v26.4s, v28.4s\n"
+      "smin v27.4s, v27.4s, v28.4s\n"
+      "smax v31.4s, v31.4s, v23.4s\n"
+      "smax v20.4s, v20.4s, v23.4s\n"
+      "smax v21.4s, v21.4s, v23.4s\n"
+      "smax v22.4s, v22.4s, v23.4s\n"
+      "smax v16.4s, v16.4s, v23.4s\n"
+      "smax v17.4s, v17.4s, v23.4s\n"
+      "smax v18.4s, v18.4s, v23.4s\n"
+      "smax v19.4s, v19.4s, v23.4s\n"
+      "smax v24.4s, v24.4s, v23.4s\n"
+      "smax v25.4s, v25.4s, v23.4s\n"
+      "smax v26.4s, v26.4s, v23.4s\n"
+      "smax v27.4s, v27.4s, v23.4s\n"
       "uzp1 v31.8h, v31.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v18.8h, v18.8h, v19.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
       "uzp1 v31.16b, v31.16b, v20.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v16.16b, v16.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 95f\n"
       "tbz x9, #3, 90f\n"
       "str d31, [x27], #0x8\n"
-      "str d16, [x22], #0x8\n"
-      "str d24, [x21], #0x8\n"
+      "str d16, [x23], #0x8\n"
+      "str d24, [x22], #0x8\n"
       "tbz x9, #2, 88f\n"
       "st1 { v31.s }[2], [x27], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
-      "st1 { v24.s }[2], [x21], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v24.s }[2], [x22], #0x4\n"
       "tbz x9, #1, 87f\n"
       "st1 { v31.h }[6], [x27], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
-      "st1 { v24.h }[6], [x21], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v24.h }[6], [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[14], [x27]\n"
-      "st1 { v16.b }[14], [x22]\n"
-      "st1 { v24.b }[14], [x21]\n"
+      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v24.b }[14], [x22]\n"
       "b 94f\n"
       "87:"  // Height 3: Partial direct writeback: partial_1_12
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[12], [x27]\n"
-      "st1 { v16.b }[12], [x22]\n"
-      "st1 { v24.b }[12], [x21]\n"
+      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v24.b }[12], [x22]\n"
       "b 94f\n"
       "88:"  // Height 3: Partial direct writeback: partial_2_8
       "tbz x9, #1, 89f\n"
       "st1 { v31.h }[4], [x27], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
-      "st1 { v24.h }[4], [x21], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v24.h }[4], [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[10], [x27]\n"
-      "st1 { v16.b }[10], [x22]\n"
-      "st1 { v24.b }[10], [x21]\n"
+      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v24.b }[10], [x22]\n"
       "b 94f\n"
       "89:"  // Height 3: Partial direct writeback: partial_1_8
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[8], [x27]\n"
-      "st1 { v16.b }[8], [x22]\n"
-      "st1 { v24.b }[8], [x21]\n"
+      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v24.b }[8], [x22]\n"
       "b 94f\n"
       "90:"  // Height 3: Partial direct writeback: partial_4_0
       "tbz x9, #2, 92f\n"
       "str s31, [x27], #0x4\n"
-      "str s16, [x22], #0x4\n"
-      "str s24, [x21], #0x4\n"
+      "str s16, [x23], #0x4\n"
+      "str s24, [x22], #0x4\n"
       "tbz x9, #1, 91f\n"
       "st1 { v31.h }[2], [x27], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
-      "st1 { v24.h }[2], [x21], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v24.h }[2], [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[6], [x27]\n"
-      "st1 { v16.b }[6], [x22]\n"
-      "st1 { v24.b }[6], [x21]\n"
+      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v24.b }[6], [x22]\n"
       "b 94f\n"
       "91:"  // Height 3: Partial direct writeback: partial_1_4
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[4], [x27]\n"
-      "st1 { v16.b }[4], [x22]\n"
-      "st1 { v24.b }[4], [x21]\n"
+      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v24.b }[4], [x22]\n"
       "b 94f\n"
       "92:"  // Height 3: Partial direct writeback: partial_2_0
       "tbz x9, #1, 93f\n"
       "str h31, [x27], #0x2\n"
-      "str h16, [x22], #0x2\n"
-      "str h24, [x21], #0x2\n"
+      "str h16, [x23], #0x2\n"
+      "str h24, [x22], #0x2\n"
       "tbz x9, #0, 94f\n"
       "st1 { v31.b }[2], [x27]\n"
-      "st1 { v16.b }[2], [x22]\n"
-      "st1 { v24.b }[2], [x21]\n"
+      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v24.b }[2], [x22]\n"
       "b 94f\n"
       "93:"  // Height 3: Partial direct writeback: partial_1_0
       "str b31, [x27, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
-      "str b24, [x21, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
+      "str b24, [x22, #0x0]\n"
       "94:"  // Height 3: Partial direct writeback: Done
       "b 96f\n"
       "95:"  // Height 3: Full writeback
       "str q31, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q16, [x22, #0x0]\n"
-      "str q24, [x21, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
+      "str q24, [x22, #0x0]\n"
       "96:"  // Height 3: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 66b\n"
@@ -1451,14 +1450,14 @@
       "100:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 101f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 102f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1468,9 +1467,9 @@
       "b 102f\n"
       "101:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "102:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "blt 107f\n"
@@ -1630,42 +1629,42 @@
       "blt 110f\n"
       "108:"  // Height 4: Multiply loop: Odd block loop
       "ldr d1, [x24], #0x8\n"
-      "ldr d2, [x23], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v2.2d, v3.2d, v7.2d\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v0.2d, v1.2d, v0.2d\n"
+      "ldr d2, [x22], #0x8\n"
+      "ldr d1, [x21], #0x8\n"
+      "trn1 v2.2d, v2.2d, v1.2d\n"
       "tbnz %x[flags], #31, 109f\n"
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "109:"  // Height 4: Multiply loop: unique 15: skip row sum
-      "ldr q8, [x28, #0x0]\n"
-      "ldr q9, [x28, #0x10]\n"
-      ".inst 0x6e88a410  // ummla v16.4s, v0.16b, v8.16b\n"
-      ".inst 0x6e88a458  // ummla v24.4s, v2.16b, v8.16b\n"
-      "ldr q10, [x28, #0x20]\n"
-      "ldr q4, [x28, #0x30]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q1, [x28, #0x10]\n"
+      ".inst 0x6e83a410  // ummla v16.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a458  // ummla v24.4s, v2.16b, v3.16b\n"
+      "ldr q7, [x28, #0x20]\n"
+      "ldr q6, [x28, #0x30]\n"
       "sub x25, x25, #0x8\n"
       "cmp x25, #0x8\n"
       "ldr q5, [x28, #0x40]\n"
-      "ldr q6, [x28, #0x50]\n"
-      ".inst 0x6e89a414  // ummla v20.4s, v0.16b, v9.16b\n"
-      ".inst 0x6e89a45c  // ummla v28.4s, v2.16b, v9.16b\n"
-      "ldr q7, [x28, #0x60]\n"
-      "ldr q8, [x28, #0x70]\n"
-      ".inst 0x6e8aa411  // ummla v17.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e8aa459  // ummla v25.4s, v2.16b, v10.16b\n"
-      ".inst 0x6e84a415  // ummla v21.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e84a45d  // ummla v29.4s, v2.16b, v4.16b\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x6e81a414  // ummla v20.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a45c  // ummla v28.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
+      ".inst 0x6e87a411  // ummla v17.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e86a415  // ummla v21.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e86a45d  // ummla v29.4s, v2.16b, v6.16b\n"
       "add x28, x28, #0x80\n"
       ".inst 0x6e85a412  // ummla v18.4s, v0.16b, v5.16b\n"
       ".inst 0x6e85a45a  // ummla v26.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e86a416  // ummla v22.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a45e  // ummla v30.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a413  // ummla v19.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a45b  // ummla v27.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e88a417  // ummla v23.4s, v0.16b, v8.16b\n"
-      ".inst 0x6e88a45f  // ummla v31.4s, v2.16b, v8.16b\n"
+      ".inst 0x6e84a416  // ummla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a45e  // ummla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e83a413  // ummla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a45b  // ummla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e81a417  // ummla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a45f  // ummla v31.4s, v2.16b, v1.16b\n"
       "bge 108b\n"
       "110:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x25, 116f\n"
@@ -1716,51 +1715,51 @@
       ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
       ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
       "115:"  // Height 4: Multiply loop: unique 16: skip row sum
-      "ldr q10, [x28, #0x0]\n"
-      "ldr q4, [x28, #0x10]\n"
-      ".inst 0x6e8aa410  // ummla v16.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e8aa458  // ummla v24.4s, v2.16b, v10.16b\n"
-      "ldr q5, [x28, #0x20]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q3, [x28, #0x10]\n"
+      ".inst 0x6e81a410  // ummla v16.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a458  // ummla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x28, #0x20]\n"
       "ldr q6, [x28, #0x30]\n"
-      ".inst 0x6e84a414  // ummla v20.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e84a45c  // ummla v28.4s, v2.16b, v4.16b\n"
-      "ldr q7, [x28, #0x40]\n"
-      "ldr q8, [x28, #0x50]\n"
-      ".inst 0x6e85a411  // ummla v17.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
-      "ldr q9, [x28, #0x60]\n"
-      "ldr q10, [x28, #0x70]\n"
+      ".inst 0x6e83a414  // ummla v20.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a45c  // ummla v28.4s, v2.16b, v3.16b\n"
+      "ldr q5, [x28, #0x40]\n"
+      "ldr q4, [x28, #0x50]\n"
+      ".inst 0x6e81a411  // ummla v17.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a459  // ummla v25.4s, v2.16b, v1.16b\n"
+      "ldr q3, [x28, #0x60]\n"
+      "ldr q1, [x28, #0x70]\n"
       ".inst 0x6e86a415  // ummla v21.4s, v0.16b, v6.16b\n"
       ".inst 0x6e86a45d  // ummla v29.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a412  // ummla v18.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a45a  // ummla v26.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e85a412  // ummla v18.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e85a45a  // ummla v26.4s, v2.16b, v5.16b\n"
       "add x28, x28, #0x80\n"
-      ".inst 0x6e88a416  // ummla v22.4s, v0.16b, v8.16b\n"
-      ".inst 0x6e88a45e  // ummla v30.4s, v2.16b, v8.16b\n"
-      ".inst 0x6e89a413  // ummla v19.4s, v0.16b, v9.16b\n"
-      ".inst 0x6e89a45b  // ummla v27.4s, v2.16b, v9.16b\n"
-      ".inst 0x6e8aa417  // ummla v23.4s, v0.16b, v10.16b\n"
-      ".inst 0x6e8aa45f  // ummla v31.4s, v2.16b, v10.16b\n"
+      ".inst 0x6e84a416  // ummla v22.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e84a45e  // ummla v30.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e83a413  // ummla v19.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e83a45b  // ummla v27.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e81a417  // ummla v23.4s, v0.16b, v1.16b\n"
+      ".inst 0x6e81a45f  // ummla v31.4s, v2.16b, v1.16b\n"
       "116:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 100b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 v4.2d, v16.2d, v20.2d\n"
-      "add x22, x27, x20\n"
+      "uzp1 v0.2d, v16.2d, v20.2d\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "add x21, x22, x20\n"
-      "add x20, x21, x20\n"
       "uzp2 v16.2d, v16.2d, v20.2d\n"
       "uzp1 v20.2d, v17.2d, v21.2d\n"
       "prfm pstl1keep, [x27, #0x0]\n"
       "uzp2 v17.2d, v17.2d, v21.2d\n"
       "uzp1 v21.2d, v18.2d, v22.2d\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
       "prfm pstl1keep, [x22, #0x0]\n"
-      "prfm pstl1keep, [x21, #0x0]\n"
       "uzp2 v18.2d, v18.2d, v22.2d\n"
       "uzp1 v22.2d, v19.2d, v23.2d\n"
-      "prfm pstl1keep, [x20, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
       "uzp2 v19.2d, v19.2d, v23.2d\n"
       "uzp1 v23.2d, v24.2d, v28.2d\n"
       "uzp2 v24.2d, v24.2d, v28.2d\n"
@@ -1770,38 +1769,38 @@
       "uzp2 v26.2d, v26.2d, v30.2d\n"
       "uzp1 v30.2d, v27.2d, v31.2d\n"
       "uzp2 v27.2d, v27.2d, v31.2d\n"
-      "mov v31.16b, v4.16b\n"
+      "mov v31.16b, v0.16b\n"
       "tbnz %x[flags], #31, 117f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1r { v0.4s }, [x20]\n"
       "addp v11.4s, v11.4s, v11.4s\n"
       "addp v13.4s, v13.4s, v13.4s\n"
-      "neg v4.4s, v4.4s\n"
+      "neg v0.4s, v0.4s\n"
       "dup v12.4s, v11.s[3]\n"
       "dup v11.4s, v11.s[0]\n"
       "dup v14.4s, v13.s[3]\n"
       "dup v13.4s, v13.s[0]\n"
-      "mul v11.4s, v11.4s, v4.4s\n"
-      "mul v12.4s, v12.4s, v4.4s\n"
-      "mul v13.4s, v13.4s, v4.4s\n"
-      "mul v14.4s, v14.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v0.4s\n"
+      "mul v12.4s, v12.4s, v0.4s\n"
+      "mul v13.4s, v13.4s, v0.4s\n"
+      "mul v14.4s, v14.4s, v0.4s\n"
       "117:"  // Height 4: skip row sum fixup
       "ldr q0, [x10, #0x0]\n"
-      "ldr q1, [x10, #0x10]\n"
+      "ldr q4, [x10, #0x10]\n"
       "add v31.4s, v31.4s, v11.4s\n"
       "add v20.4s, v20.4s, v11.4s\n"
-      "ldr q2, [x10, #0x20]\n"
-      "ldr q3, [x10, #0x30]\n"
+      "ldr q3, [x10, #0x20]\n"
+      "ldr q2, [x10, #0x30]\n"
       "add v21.4s, v21.4s, v11.4s\n"
       "add v22.4s, v22.4s, v11.4s\n"
       "add v16.4s, v16.4s, v12.4s\n"
       "add v17.4s, v17.4s, v12.4s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "add v18.4s, v18.4s, v12.4s\n"
       "add v19.4s, v19.4s, v12.4s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add v23.4s, v23.4s, v13.4s\n"
       "add v28.4s, v28.4s, v13.4s\n"
       "add x10, x10, #0x40\n"
@@ -1812,100 +1811,100 @@
       "add v26.4s, v26.4s, v14.4s\n"
       "add v27.4s, v27.4s, v14.4s\n"
       "add v31.4s, v31.4s, v0.4s\n"
-      "add v20.4s, v20.4s, v1.4s\n"
-      "add v21.4s, v21.4s, v2.4s\n"
-      "add v22.4s, v22.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v3.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
       "add v16.4s, v16.4s, v0.4s\n"
-      "add v17.4s, v17.4s, v1.4s\n"
-      "add v18.4s, v18.4s, v2.4s\n"
-      "add v19.4s, v19.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v2.4s\n"
       "add v23.4s, v23.4s, v0.4s\n"
-      "add v28.4s, v28.4s, v1.4s\n"
-      "add v29.4s, v29.4s, v2.4s\n"
-      "add v30.4s, v30.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v4.4s\n"
+      "add v29.4s, v29.4s, v3.4s\n"
+      "add v30.4s, v30.4s, v2.4s\n"
       "add v24.4s, v24.4s, v0.4s\n"
-      "ld1r { v0.4s }, [x23]\n"
-      "add v25.4s, v25.4s, v1.4s\n"
-      "add v26.4s, v26.4s, v2.4s\n"
-      "add v27.4s, v27.4s, v3.4s\n"
-      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
-      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
-      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
-      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
-      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
-      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
-      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
-      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
-      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
-      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
-      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
-      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
-      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
-      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
-      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
-      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v2.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v1.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v1.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v1.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v1.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v1.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
       "tbz %x[flags], #5, 118f\n"
-      "and v4.16b, v31.16b, v0.16b\n"
-      "and v5.16b, v20.16b, v0.16b\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v31.4s, v31.4s, v4.4s\n"
-      "sqadd v20.4s, v20.4s, v5.4s\n"
-      "and v6.16b, v21.16b, v0.16b\n"
-      "and v7.16b, v22.16b, v0.16b\n"
-      "and v8.16b, v16.16b, v0.16b\n"
-      "and v9.16b, v17.16b, v0.16b\n"
-      "and v10.16b, v18.16b, v0.16b\n"
-      "and v4.16b, v19.16b, v0.16b\n"
-      "and v5.16b, v23.16b, v0.16b\n"
-      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v2.16b, v31.16b, v0.16b\n"
+      "and v1.16b, v20.16b, v0.16b\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v31.4s, v31.4s, v2.4s\n"
+      "sqadd v20.4s, v20.4s, v1.4s\n"
+      "and v7.16b, v21.16b, v0.16b\n"
+      "and v6.16b, v22.16b, v0.16b\n"
+      "and v5.16b, v16.16b, v0.16b\n"
+      "and v4.16b, v17.16b, v0.16b\n"
+      "and v3.16b, v18.16b, v0.16b\n"
+      "and v2.16b, v19.16b, v0.16b\n"
+      "and v1.16b, v23.16b, v0.16b\n"
       "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
-      "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v21.4s, v21.4s, v6.4s\n"
-      "sqadd v22.4s, v22.4s, v7.4s\n"
-      "sqadd v16.4s, v16.4s, v8.4s\n"
-      "sqadd v17.4s, v17.4s, v9.4s\n"
-      "sqadd v18.4s, v18.4s, v10.4s\n"
-      "sqadd v19.4s, v19.4s, v4.4s\n"
-      "sqadd v23.4s, v23.4s, v5.4s\n"
-      "and v6.16b, v28.16b, v0.16b\n"
-      "and v7.16b, v29.16b, v0.16b\n"
-      "and v8.16b, v30.16b, v0.16b\n"
-      "and v9.16b, v24.16b, v0.16b\n"
-      "and v10.16b, v25.16b, v0.16b\n"
-      "and v4.16b, v26.16b, v0.16b\n"
-      "and v5.16b, v27.16b, v0.16b\n"
       "sshr v6.4s, v6.4s, #0x1f\n"
-      "sshr v7.4s, v7.4s, #0x1f\n"
-      "sshr v8.4s, v8.4s, #0x1f\n"
-      "sshr v9.4s, v9.4s, #0x1f\n"
-      "sshr v10.4s, v10.4s, #0x1f\n"
-      "sshr v4.4s, v4.4s, #0x1f\n"
       "sshr v5.4s, v5.4s, #0x1f\n"
-      "sqadd v28.4s, v28.4s, v6.4s\n"
-      "sqadd v29.4s, v29.4s, v7.4s\n"
-      "sqadd v30.4s, v30.4s, v8.4s\n"
-      "sqadd v24.4s, v24.4s, v9.4s\n"
-      "sqadd v25.4s, v25.4s, v10.4s\n"
-      "sqadd v26.4s, v26.4s, v4.4s\n"
-      "sqadd v27.4s, v27.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v7.4s\n"
+      "sqadd v22.4s, v22.4s, v6.4s\n"
+      "sqadd v16.4s, v16.4s, v5.4s\n"
+      "sqadd v17.4s, v17.4s, v4.4s\n"
+      "sqadd v18.4s, v18.4s, v3.4s\n"
+      "sqadd v19.4s, v19.4s, v2.4s\n"
+      "sqadd v23.4s, v23.4s, v1.4s\n"
+      "and v7.16b, v28.16b, v0.16b\n"
+      "and v6.16b, v29.16b, v0.16b\n"
+      "and v5.16b, v30.16b, v0.16b\n"
+      "and v4.16b, v24.16b, v0.16b\n"
+      "and v3.16b, v25.16b, v0.16b\n"
+      "and v2.16b, v26.16b, v0.16b\n"
+      "and v1.16b, v27.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sshr v3.4s, v3.4s, #0x1f\n"
+      "sshr v2.4s, v2.4s, #0x1f\n"
+      "sshr v1.4s, v1.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v7.4s\n"
+      "sqadd v29.4s, v29.4s, v6.4s\n"
+      "sqadd v30.4s, v30.4s, v5.4s\n"
+      "sqadd v24.4s, v24.4s, v4.4s\n"
+      "sqadd v25.4s, v25.4s, v3.4s\n"
+      "sqadd v26.4s, v26.4s, v2.4s\n"
+      "sqadd v27.4s, v27.4s, v1.4s\n"
       "118:"  // Height 4: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1r { v4.4s }, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1r { v3.4s }, [x20]\n"
       "srshl v31.4s, v31.4s, v0.4s\n"
       "srshl v20.4s, v20.4s, v0.4s\n"
       "srshl v21.4s, v21.4s, v0.4s\n"
       "srshl v22.4s, v22.4s, v0.4s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1r { v6.4s }, [x23]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1r { v2.4s }, [x20]\n"
       "srshl v16.4s, v16.4s, v0.4s\n"
       "srshl v17.4s, v17.4s, v0.4s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1r { v5.4s }, [x23]\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1r { v1.4s }, [x20]\n"
       "srshl v18.4s, v18.4s, v0.4s\n"
       "srshl v19.4s, v19.4s, v0.4s\n"
       "cmp x9, #0x10\n"
@@ -1917,163 +1916,163 @@
       "srshl v25.4s, v25.4s, v0.4s\n"
       "srshl v26.4s, v26.4s, v0.4s\n"
       "srshl v27.4s, v27.4s, v0.4s\n"
-      "add v31.4s, v31.4s, v4.4s\n"
-      "add v20.4s, v20.4s, v4.4s\n"
-      "add v21.4s, v21.4s, v4.4s\n"
-      "add v22.4s, v22.4s, v4.4s\n"
-      "add v16.4s, v16.4s, v4.4s\n"
-      "add v17.4s, v17.4s, v4.4s\n"
-      "add v18.4s, v18.4s, v4.4s\n"
-      "add v19.4s, v19.4s, v4.4s\n"
-      "add v23.4s, v23.4s, v4.4s\n"
-      "add v28.4s, v28.4s, v4.4s\n"
-      "add v29.4s, v29.4s, v4.4s\n"
-      "add v30.4s, v30.4s, v4.4s\n"
-      "add v24.4s, v24.4s, v4.4s\n"
-      "add v25.4s, v25.4s, v4.4s\n"
-      "add v26.4s, v26.4s, v4.4s\n"
-      "add v27.4s, v27.4s, v4.4s\n"
-      "smin v31.4s, v31.4s, v6.4s\n"
-      "smin v20.4s, v20.4s, v6.4s\n"
-      "smin v21.4s, v21.4s, v6.4s\n"
-      "smin v22.4s, v22.4s, v6.4s\n"
-      "smin v16.4s, v16.4s, v6.4s\n"
-      "smin v17.4s, v17.4s, v6.4s\n"
-      "smin v18.4s, v18.4s, v6.4s\n"
-      "smin v19.4s, v19.4s, v6.4s\n"
-      "smin v23.4s, v23.4s, v6.4s\n"
-      "smin v28.4s, v28.4s, v6.4s\n"
-      "smin v29.4s, v29.4s, v6.4s\n"
-      "smin v30.4s, v30.4s, v6.4s\n"
-      "smin v24.4s, v24.4s, v6.4s\n"
-      "smin v25.4s, v25.4s, v6.4s\n"
-      "smin v26.4s, v26.4s, v6.4s\n"
-      "smin v27.4s, v27.4s, v6.4s\n"
-      "smax v31.4s, v31.4s, v5.4s\n"
-      "smax v20.4s, v20.4s, v5.4s\n"
-      "smax v21.4s, v21.4s, v5.4s\n"
-      "smax v22.4s, v22.4s, v5.4s\n"
-      "smax v16.4s, v16.4s, v5.4s\n"
-      "smax v17.4s, v17.4s, v5.4s\n"
-      "smax v18.4s, v18.4s, v5.4s\n"
-      "smax v19.4s, v19.4s, v5.4s\n"
-      "smax v23.4s, v23.4s, v5.4s\n"
-      "smax v28.4s, v28.4s, v5.4s\n"
-      "smax v29.4s, v29.4s, v5.4s\n"
-      "smax v30.4s, v30.4s, v5.4s\n"
-      "smax v24.4s, v24.4s, v5.4s\n"
-      "smax v25.4s, v25.4s, v5.4s\n"
-      "smax v26.4s, v26.4s, v5.4s\n"
-      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v3.4s\n"
+      "add v22.4s, v22.4s, v3.4s\n"
+      "add v16.4s, v16.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v3.4s\n"
+      "add v18.4s, v18.4s, v3.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v3.4s\n"
+      "add v30.4s, v30.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v3.4s\n"
+      "add v26.4s, v26.4s, v3.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "smin v31.4s, v31.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v2.4s\n"
+      "smin v21.4s, v21.4s, v2.4s\n"
+      "smin v22.4s, v22.4s, v2.4s\n"
+      "smin v16.4s, v16.4s, v2.4s\n"
+      "smin v17.4s, v17.4s, v2.4s\n"
+      "smin v18.4s, v18.4s, v2.4s\n"
+      "smin v19.4s, v19.4s, v2.4s\n"
+      "smin v23.4s, v23.4s, v2.4s\n"
+      "smin v28.4s, v28.4s, v2.4s\n"
+      "smin v29.4s, v29.4s, v2.4s\n"
+      "smin v30.4s, v30.4s, v2.4s\n"
+      "smin v24.4s, v24.4s, v2.4s\n"
+      "smin v25.4s, v25.4s, v2.4s\n"
+      "smin v26.4s, v26.4s, v2.4s\n"
+      "smin v27.4s, v27.4s, v2.4s\n"
+      "smax v31.4s, v31.4s, v1.4s\n"
+      "smax v20.4s, v20.4s, v1.4s\n"
+      "smax v21.4s, v21.4s, v1.4s\n"
+      "smax v22.4s, v22.4s, v1.4s\n"
+      "smax v16.4s, v16.4s, v1.4s\n"
+      "smax v17.4s, v17.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v1.4s\n"
+      "smax v19.4s, v19.4s, v1.4s\n"
+      "smax v23.4s, v23.4s, v1.4s\n"
+      "smax v28.4s, v28.4s, v1.4s\n"
+      "smax v29.4s, v29.4s, v1.4s\n"
+      "smax v30.4s, v30.4s, v1.4s\n"
+      "smax v24.4s, v24.4s, v1.4s\n"
+      "smax v25.4s, v25.4s, v1.4s\n"
+      "smax v26.4s, v26.4s, v1.4s\n"
+      "smax v27.4s, v27.4s, v1.4s\n"
       "uzp1 v31.8h, v31.8h, v20.8h\n"
       "uzp1 v20.8h, v21.8h, v22.8h\n"
       "uzp1 v16.8h, v16.8h, v17.8h\n"
-      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v19.8h, v18.8h, v19.8h\n"
       "uzp1 v23.8h, v23.8h, v28.8h\n"
-      "uzp1 v28.8h, v29.8h, v30.8h\n"
+      "uzp1 v18.8h, v29.8h, v30.8h\n"
       "uzp1 v24.8h, v24.8h, v25.8h\n"
-      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v17.8h, v26.8h, v27.8h\n"
       "uzp1 v31.16b, v31.16b, v20.16b\n"
-      "uzp1 v16.16b, v16.16b, v17.16b\n"
-      "uzp1 v23.16b, v23.16b, v28.16b\n"
-      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v16.16b, v16.16b, v19.16b\n"
+      "uzp1 v23.16b, v23.16b, v18.16b\n"
+      "uzp1 v24.16b, v24.16b, v17.16b\n"
       "bge 127f\n"
       "tbz x9, #3, 122f\n"
       "str d31, [x27], #0x8\n"
-      "str d16, [x22], #0x8\n"
-      "str d23, [x21], #0x8\n"
-      "str d24, [x20], #0x8\n"
+      "str d16, [x23], #0x8\n"
+      "str d23, [x22], #0x8\n"
+      "str d24, [x21], #0x8\n"
       "tbz x9, #2, 120f\n"
       "st1 { v31.s }[2], [x27], #0x4\n"
-      "st1 { v16.s }[2], [x22], #0x4\n"
-      "st1 { v23.s }[2], [x21], #0x4\n"
-      "st1 { v24.s }[2], [x20], #0x4\n"
+      "st1 { v16.s }[2], [x23], #0x4\n"
+      "st1 { v23.s }[2], [x22], #0x4\n"
+      "st1 { v24.s }[2], [x21], #0x4\n"
       "tbz x9, #1, 119f\n"
       "st1 { v31.h }[6], [x27], #0x2\n"
-      "st1 { v16.h }[6], [x22], #0x2\n"
-      "st1 { v23.h }[6], [x21], #0x2\n"
-      "st1 { v24.h }[6], [x20], #0x2\n"
+      "st1 { v16.h }[6], [x23], #0x2\n"
+      "st1 { v23.h }[6], [x22], #0x2\n"
+      "st1 { v24.h }[6], [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[14], [x27]\n"
-      "st1 { v16.b }[14], [x22]\n"
-      "st1 { v23.b }[14], [x21]\n"
-      "st1 { v24.b }[14], [x20]\n"
+      "st1 { v16.b }[14], [x23]\n"
+      "st1 { v23.b }[14], [x22]\n"
+      "st1 { v24.b }[14], [x21]\n"
       "b 126f\n"
       "119:"  // Height 4: Partial direct writeback: partial_1_12
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[12], [x27]\n"
-      "st1 { v16.b }[12], [x22]\n"
-      "st1 { v23.b }[12], [x21]\n"
-      "st1 { v24.b }[12], [x20]\n"
+      "st1 { v16.b }[12], [x23]\n"
+      "st1 { v23.b }[12], [x22]\n"
+      "st1 { v24.b }[12], [x21]\n"
       "b 126f\n"
       "120:"  // Height 4: Partial direct writeback: partial_2_8
       "tbz x9, #1, 121f\n"
       "st1 { v31.h }[4], [x27], #0x2\n"
-      "st1 { v16.h }[4], [x22], #0x2\n"
-      "st1 { v23.h }[4], [x21], #0x2\n"
-      "st1 { v24.h }[4], [x20], #0x2\n"
+      "st1 { v16.h }[4], [x23], #0x2\n"
+      "st1 { v23.h }[4], [x22], #0x2\n"
+      "st1 { v24.h }[4], [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[10], [x27]\n"
-      "st1 { v16.b }[10], [x22]\n"
-      "st1 { v23.b }[10], [x21]\n"
-      "st1 { v24.b }[10], [x20]\n"
+      "st1 { v16.b }[10], [x23]\n"
+      "st1 { v23.b }[10], [x22]\n"
+      "st1 { v24.b }[10], [x21]\n"
       "b 126f\n"
       "121:"  // Height 4: Partial direct writeback: partial_1_8
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[8], [x27]\n"
-      "st1 { v16.b }[8], [x22]\n"
-      "st1 { v23.b }[8], [x21]\n"
-      "st1 { v24.b }[8], [x20]\n"
+      "st1 { v16.b }[8], [x23]\n"
+      "st1 { v23.b }[8], [x22]\n"
+      "st1 { v24.b }[8], [x21]\n"
       "b 126f\n"
       "122:"  // Height 4: Partial direct writeback: partial_4_0
       "tbz x9, #2, 124f\n"
       "str s31, [x27], #0x4\n"
-      "str s16, [x22], #0x4\n"
-      "str s23, [x21], #0x4\n"
-      "str s24, [x20], #0x4\n"
+      "str s16, [x23], #0x4\n"
+      "str s23, [x22], #0x4\n"
+      "str s24, [x21], #0x4\n"
       "tbz x9, #1, 123f\n"
       "st1 { v31.h }[2], [x27], #0x2\n"
-      "st1 { v16.h }[2], [x22], #0x2\n"
-      "st1 { v23.h }[2], [x21], #0x2\n"
-      "st1 { v24.h }[2], [x20], #0x2\n"
+      "st1 { v16.h }[2], [x23], #0x2\n"
+      "st1 { v23.h }[2], [x22], #0x2\n"
+      "st1 { v24.h }[2], [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[6], [x27]\n"
-      "st1 { v16.b }[6], [x22]\n"
-      "st1 { v23.b }[6], [x21]\n"
-      "st1 { v24.b }[6], [x20]\n"
+      "st1 { v16.b }[6], [x23]\n"
+      "st1 { v23.b }[6], [x22]\n"
+      "st1 { v24.b }[6], [x21]\n"
       "b 126f\n"
       "123:"  // Height 4: Partial direct writeback: partial_1_4
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[4], [x27]\n"
-      "st1 { v16.b }[4], [x22]\n"
-      "st1 { v23.b }[4], [x21]\n"
-      "st1 { v24.b }[4], [x20]\n"
+      "st1 { v16.b }[4], [x23]\n"
+      "st1 { v23.b }[4], [x22]\n"
+      "st1 { v24.b }[4], [x21]\n"
       "b 126f\n"
       "124:"  // Height 4: Partial direct writeback: partial_2_0
       "tbz x9, #1, 125f\n"
       "str h31, [x27], #0x2\n"
-      "str h16, [x22], #0x2\n"
-      "str h23, [x21], #0x2\n"
-      "str h24, [x20], #0x2\n"
+      "str h16, [x23], #0x2\n"
+      "str h23, [x22], #0x2\n"
+      "str h24, [x21], #0x2\n"
       "tbz x9, #0, 126f\n"
       "st1 { v31.b }[2], [x27]\n"
-      "st1 { v16.b }[2], [x22]\n"
-      "st1 { v23.b }[2], [x21]\n"
-      "st1 { v24.b }[2], [x20]\n"
+      "st1 { v16.b }[2], [x23]\n"
+      "st1 { v23.b }[2], [x22]\n"
+      "st1 { v24.b }[2], [x21]\n"
       "b 126f\n"
       "125:"  // Height 4: Partial direct writeback: partial_1_0
       "str b31, [x27, #0x0]\n"
-      "str b16, [x22, #0x0]\n"
-      "str b23, [x21, #0x0]\n"
-      "str b24, [x20, #0x0]\n"
+      "str b16, [x23, #0x0]\n"
+      "str b23, [x22, #0x0]\n"
+      "str b24, [x21, #0x0]\n"
       "126:"  // Height 4: Partial direct writeback: Done
       "b 128f\n"
       "127:"  // Height 4: Full writeback
       "str q31, [x27, #0x0]\n"
       "add x27, x27, #0x10\n"
-      "str q16, [x22, #0x0]\n"
-      "str q23, [x21, #0x0]\n"
-      "str q24, [x20, #0x0]\n"
+      "str q16, [x23, #0x0]\n"
+      "str q23, [x22, #0x0]\n"
+      "str q24, [x21, #0x0]\n"
       "128:"  // Height 4: Writeback done
       "subs x9, x9, #0x10\n"
       "bgt 98b\n"
@@ -2089,7 +2088,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "130:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index ce96c1b..38bb7c6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -121,5 +121,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
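
Note (reviewer annotation, not part of the patch): the a64_hybrid_u8u32_dot_6x16/a55.cpp hunks below are the same kind of change as above; the scratch registers feeding the hand-encoded UDOT instructions are renumbered (e.g. the weight temporaries v6/v7 become v17/v16 and the offset temporaries x11/x12 become x20/x21), so the .inst words change only in their register fields while the arithmetic is identical. As a reminder of what an instruction such as "udot v8.4s, v6.16b, v0.4b[0]" computes, here is a scalar model (illustrative only, not library code):

#include <cstdint>

// Scalar model of the by-element dot product: every 32-bit lane of d
// accumulates the dot product of its own four u8 weights in n with the
// four u8 activation bytes selected from m by idx.
static void udot_by_element(uint32_t d[4], const uint8_t n[16], const uint8_t m[16], int idx)
{
    for (int lane = 0; lane < 4; ++lane) {
        uint32_t sum = 0;
        for (int b = 0; b < 4; ++b) {
            sum += static_cast<uint32_t>(n[4 * lane + b]) * m[4 * idx + b];
        }
        d[lane] += sum;
    }
}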
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
index 705f652..7f0fad7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
@@ -77,7 +77,6 @@
     ka.N = N;
     ka.B_ptr = B_ptr;
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 171f\n"
@@ -165,11 +164,11 @@
       "14:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 15f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
       "cbnz x15, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
@@ -186,129 +185,129 @@
       "blt 18f\n"
       "17:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr d17, [x16, #0x20]\n"
+      "ldr x20, [x16, #0x28]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x38]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      "ldr x12, [x16, #0x48]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x78]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xb8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xf8]\n"
-      "mov v7.d[1], x11\n"
+      "ldr d16, [x16, #0x30]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x38]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr d17, [x16, #0x40]\n"
+      "ldr x20, [x16, #0x48]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr d16, [x16, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr d17, [x16, #0x60]\n"
+      "ldr x20, [x16, #0x68]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr d16, [x16, #0x70]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x78]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr d17, [x16, #0x80]\n"
+      "ldr x20, [x16, #0x88]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr d16, [x16, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr d17, [x16, #0xa0]\n"
+      "ldr x20, [x16, #0xa8]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr d16, [x16, #0xb0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xb8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr d17, [x16, #0xc0]\n"
+      "ldr x20, [x16, #0xc8]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr d16, [x16, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr d17, [x16, #0xe0]\n"
+      "ldr x20, [x16, #0xe8]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr d16, [x16, #0xf0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xf8]\n"
+      "mov v16.d[1], x20\n"
       "add x13, x13, #0x10\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      "ldr x20, [x16, #0x8]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
       "sub x14, x14, #0x10\n"
       "ldr d7, [x16, #0x10]\n"
       "cmp x14, #0x20\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x18]\n"
-      "mov v0.d[1], x10\n"
-      "mov v7.d[1], x11\n"
+      "ldr x21, [x13, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x16, #0x18]\n"
+      "mov v0.d[1], x21\n"
+      "mov v7.d[1], x20\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       "bge 17b\n"
       "18:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q17, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x16, #0x40]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x50]\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x16, #0x60]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x16, #0x70]\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x16, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x16, #0x90]\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x16, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x16, #0xb0]\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x16, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x16, #0xd0]\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x16, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x16, #0xf0]\n"
       "add x13, x13, #0x10\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
       "19:"  // Height 1: Multiply loop: Main loop skip
       "cbz x14, 24f\n"
       "cmp x14, #0x4\n"
       "blt 21f\n"
       "20:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s18, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q16, [x16, #0x0]\n"
+      ".inst 0x6f92e208  // udot v8.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x6f92e209  // udot v9.4s, v16.16b, v18.4b[0]\n"
+      "ldr q17, [x16, #0x20]\n"
       "cmp x14, #0x4\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x6f92e22a  // udot v10.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x6f92e20b  // udot v11.4s, v16.16b, v18.4b[0]\n"
       "add x16, x16, #0x40\n"
       "bge 20b\n"
       "21:"  // Height 1: Multiply loop: Skip odd blocks
@@ -321,14 +320,14 @@
       "22:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
       "23:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x0]\n"
+      ".inst 0x6f80e208  // udot v8.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x6f80e209  // udot v9.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x20]\n"
+      ".inst 0x6f80e20a  // udot v10.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
       "24:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -499,226 +498,226 @@
       "48:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 49f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
       "cbnz x15, 50f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
-      "add x9, x9, x20\n"
+      "add x12, x12, x20\n"
       "b 50f\n"
       "49:"  // Height 2: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
+      "add x12, x13, x21\n"
       "50:"  // Height 2: input setup done
       "cmp x14, #0x10\n"
       "blt 53f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 52f\n"
       "51:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d17, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x58]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x98]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0xd8]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v6.d[1], x12\n"
+      "ldr d16, [x16, #0x30]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6f81e22e  // udot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr d17, [x16, #0x40]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr x20, [x16, #0x48]\n"
+      ".inst 0x6f81e20f  // udot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr d16, [x16, #0x50]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x58]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x6fa1e22c  // udot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr d17, [x16, #0x60]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x6fa1e20d  // udot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr d16, [x16, #0x70]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6fa1e22e  // udot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr d17, [x16, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr x20, [x16, #0x88]\n"
+      ".inst 0x6fa1e20f  // udot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr d16, [x16, #0x90]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0x98]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x6f81ea2c  // udot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr d17, [x16, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x6f81ea0d  // udot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr d16, [x16, #0xb0]\n"
+      "mov v17.d[1], x21\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6f81ea2e  // udot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr d17, [x16, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr x20, [x16, #0xc8]\n"
+      ".inst 0x6f81ea0f  // udot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr d16, [x16, #0xd0]\n"
+      "mov v17.d[1], x20\n"
+      "ldr x20, [x16, #0xd8]\n"
+      "mov v16.d[1], x20\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x6fa1ea2c  // udot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr d17, [x16, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x6fa1ea0d  // udot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr d16, [x16, #0xf0]\n"
+      "mov v17.d[1], x21\n"
       "add x13, x13, #0x10\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
+      "mov v16.d[1], x20\n"
+      "add x12, x12, #0x10\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea2e  // udot v14.4s, v17.16b, v1.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0x8]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      ".inst 0x6fa1ea0f  // udot v15.4s, v16.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
       "sub x14, x14, #0x10\n"
       "ldr d7, [x16, #0x10]\n"
       "cmp x14, #0x20\n"
-      "ldr x10, [x13, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x28, [x9, #0x8]\n"
-      "mov v0.d[1], x10\n"
-      "ldr x11, [x16, #0x18]\n"
-      "mov v1.d[1], x28\n"
+      "ldr x20, [x13, #0x8]\n"
+      "mov v6.d[1], x21\n"
+      "ldr x21, [x12, #0x8]\n"
+      "mov v0.d[1], x20\n"
+      "ldr x20, [x16, #0x18]\n"
+      "mov v1.d[1], x21\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v7.d[1], x11\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "mov v7.d[1], x20\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       "bge 51b\n"
       "52:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q17, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e22e  // udot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x16, #0x40]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6f81e20f  // udot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x16, #0x50]\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x6fa1e22c  // udot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x16, #0x60]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e20d  // udot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x16, #0x70]\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e22e  // udot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x16, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e20f  // udot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x16, #0x90]\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea2c  // udot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x16, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea0d  // udot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x16, #0xb0]\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea2e  // udot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x16, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea0f  // udot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x16, #0xd0]\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea2c  // udot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x16, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea0d  // udot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x16, #0xf0]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa1ea2e  // udot v14.4s, v17.16b, v1.4b[3]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea0f  // udot v15.4s, v16.16b, v1.4b[3]\n"
       "53:"  // Height 2: Multiply loop: Main loop skip
       "cbz x14, 58f\n"
       "cmp x14, #0x4\n"
       "blt 55f\n"
       "54:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s19, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s18, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q17, [x16, #0x0]\n"
+      ".inst 0x6f93e228  // udot v8.4s, v17.16b, v19.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x6f92e22c  // udot v12.4s, v17.16b, v18.4b[0]\n"
+      "ldr q17, [x16, #0x20]\n"
+      ".inst 0x6f93e209  // udot v9.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x6f92e20d  // udot v13.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x6f93e22a  // udot v10.4s, v17.16b, v19.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f92e22e  // udot v14.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x6f93e20b  // udot v11.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x6f92e20f  // udot v15.4s, v16.16b, v18.4b[0]\n"
       "bge 54b\n"
       "55:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x14, 58f\n"
       "tbz x14, #1, 56f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
       "tbz x14, #0, 57f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
+      "ld1 { v1.b }[2], [x12]\n"
       "b 57f\n"
       "56:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
       "57:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q17, [x16, #0x0]\n"
+      ".inst 0x6f80e228  // udot v8.4s, v17.16b, v0.4b[0]\n"
+      "ldr q16, [x16, #0x10]\n"
+      ".inst 0x6f81e22c  // udot v12.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x16, #0x20]\n"
+      ".inst 0x6f80e209  // udot v9.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x6f81e20d  // udot v13.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x16, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f81e22e  // udot v14.4s, v17.16b, v1.4b[0]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x6f81e20f  // udot v15.4s, v16.16b, v1.4b[0]\n"
       "58:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -936,281 +935,281 @@
       "82:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 83f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
       "cbnz x15, 84f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
-      "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
       "b 84f\n"
       "83:"  // Height 3: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
       "84:"  // Height 3: input setup done
       "cmp x14, #0x10\n"
       "blt 87f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 86f\n"
       "85:"  // Height 3: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d21, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v21.d[1], x21\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr d20, [x16, #0x30]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6f80e2aa  // udot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f81e2ae  // udot v14.4s, v21.16b, v1.4b[0]\n"
+      "ldr x20, [x16, #0x58]\n"
+      ".inst 0x6f82e2b2  // udot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr d21, [x16, #0x40]\n"
+      ".inst 0x6f80e28b  // udot v11.4s, v20.16b, v0.4b[0]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x6f81e28f  // udot v15.4s, v20.16b, v1.4b[0]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x6f82e293  // udot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr d20, [x16, #0x50]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6fa0e2a8  // udot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ac  // udot v12.4s, v21.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x6fa2e2b0  // udot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr d21, [x16, #0x60]\n"
+      ".inst 0x6fa0e289  // udot v9.4s, v20.16b, v0.4b[1]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x6fa1e28d  // udot v13.4s, v20.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0x88]\n"
+      ".inst 0x6fa2e291  // udot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr d20, [x16, #0x70]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6fa0e2aa  // udot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ae  // udot v14.4s, v21.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x98]\n"
+      ".inst 0x6fa2e2b2  // udot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr d21, [x16, #0x80]\n"
+      ".inst 0x6fa0e28b  // udot v11.4s, v20.16b, v0.4b[1]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x6fa1e28f  // udot v15.4s, v20.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x6fa2e293  // udot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr d20, [x16, #0x90]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6f80eaa8  // udot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaac  // udot v12.4s, v21.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x6f82eab0  // udot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr d21, [x16, #0xa0]\n"
+      ".inst 0x6f80ea89  // udot v9.4s, v20.16b, v0.4b[2]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x6f81ea8d  // udot v13.4s, v20.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xc8]\n"
+      ".inst 0x6f82ea91  // udot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr d20, [x16, #0xb0]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6f80eaaa  // udot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaae  // udot v14.4s, v21.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xd8]\n"
+      ".inst 0x6f82eab2  // udot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr d21, [x16, #0xc0]\n"
+      ".inst 0x6f80ea8b  // udot v11.4s, v20.16b, v0.4b[2]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x6f81ea8f  // udot v15.4s, v20.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x6f82ea93  // udot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr d20, [x16, #0xd0]\n"
+      "mov v20.d[1], x20\n"
+      ".inst 0x6fa0eaa8  // udot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eaac  // udot v12.4s, v21.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x6fa2eab0  // udot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr d21, [x16, #0xe0]\n"
+      ".inst 0x6fa0ea89  // udot v9.4s, v20.16b, v0.4b[3]\n"
+      "mov v21.d[1], x21\n"
+      ".inst 0x6fa1ea8d  // udot v13.4s, v20.16b, v1.4b[3]\n"
       "add x13, x13, #0x10\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
-      "add x9, x9, #0x10\n"
-      "add x27, x27, #0x10\n"
+      ".inst 0x6fa2ea91  // udot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr d20, [x16, #0xf0]\n"
+      "mov v20.d[1], x20\n"
+      "add x12, x12, #0x10\n"
+      "add x11, x11, #0x10\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x10, [x13, #0x8]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eaaa  // udot v10.4s, v21.16b, v0.4b[3]\n"
+      "ldr x20, [x16, #0x8]\n"
+      ".inst 0x6fa1eaae  // udot v14.4s, v21.16b, v1.4b[3]\n"
+      "ldr x23, [x13, #0x8]\n"
+      ".inst 0x6fa2eab2  // udot v18.4s, v21.16b, v2.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea8b  // udot v11.4s, v20.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      ".inst 0x6fa1ea8f  // udot v15.4s, v20.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      "ldr x22, [x12, #0x8]\n"
+      ".inst 0x6fa2ea93  // udot v19.4s, v20.16b, v2.4b[3]\n"
+      "ldr d2, [x11, #0x0]\n"
       "sub x14, x14, #0x10\n"
       "ldr d7, [x16, #0x10]\n"
       "cmp x14, #0x20\n"
-      "ldr x26, [x27, #0x8]\n"
-      "mov v6.d[1], x12\n"
-      "ldr x11, [x16, #0x18]\n"
-      "mov v0.d[1], x10\n"
+      "ldr x21, [x11, #0x8]\n"
+      "mov v6.d[1], x20\n"
+      "ldr x20, [x16, #0x18]\n"
+      "mov v0.d[1], x23\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      "mov v1.d[1], x28\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      "mov v2.d[1], x26\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      "mov v7.d[1], x11\n"
+      "mov v1.d[1], x22\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "mov v2.d[1], x21\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      "mov v7.d[1], x20\n"
       "bge 85b\n"
       "86:"  // Height 3: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q21, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q20, [x16, #0x30]\n"
+      ".inst 0x6f80e2aa  // udot v10.4s, v21.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6f81e2ae  // udot v14.4s, v21.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x6f82e2b2  // udot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x16, #0x40]\n"
+      ".inst 0x6f80e28b  // udot v11.4s, v20.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x6f81e28f  // udot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e293  // udot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x16, #0x50]\n"
+      ".inst 0x6fa0e2a8  // udot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ac  // udot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e2b0  // udot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x16, #0x60]\n"
+      ".inst 0x6fa0e289  // udot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e28d  // udot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e291  // udot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x16, #0x70]\n"
+      ".inst 0x6fa0e2aa  // udot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ae  // udot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e2b2  // udot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x16, #0x80]\n"
+      ".inst 0x6fa0e28b  // udot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e28f  // udot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e293  // udot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x16, #0x90]\n"
+      ".inst 0x6f80eaa8  // udot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaac  // udot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x6f82eab0  // udot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x16, #0xa0]\n"
+      ".inst 0x6f80ea89  // udot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea8d  // udot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x6f82ea91  // udot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x16, #0xb0]\n"
+      ".inst 0x6f80eaaa  // udot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaae  // udot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x6f82eab2  // udot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x16, #0xc0]\n"
+      ".inst 0x6f80ea8b  // udot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea8f  // udot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x6f82ea93  // udot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x16, #0xd0]\n"
+      ".inst 0x6fa0eaa8  // udot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eaac  // udot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eab0  // udot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x16, #0xe0]\n"
+      ".inst 0x6fa0ea89  // udot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea8d  // udot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ea91  // udot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x16, #0xf0]\n"
+      ".inst 0x6fa0eaaa  // udot v10.4s, v21.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa1eaae  // udot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eab2  // udot v18.4s, v21.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ea8b  // udot v11.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea8f  // udot v15.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ea93  // udot v19.4s, v20.16b, v2.4b[3]\n"
       "87:"  // Height 3: Multiply loop: Main loop skip
       "cbz x14, 92f\n"
       "cmp x14, #0x4\n"
       "blt 89f\n"
       "88:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s24, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s23, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s22, [x11], #0x4\n"
+      "ldr q21, [x16, #0x0]\n"
+      ".inst 0x6f98e2a8  // udot v8.4s, v21.16b, v24.4b[0]\n"
+      "ldr q20, [x16, #0x10]\n"
+      ".inst 0x6f97e2ac  // udot v12.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x6f96e2b0  // udot v16.4s, v21.16b, v22.4b[0]\n"
+      "ldr q21, [x16, #0x20]\n"
+      ".inst 0x6f98e289  // udot v9.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x6f97e28d  // udot v13.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x6f96e291  // udot v17.4s, v20.16b, v22.4b[0]\n"
+      "ldr q20, [x16, #0x30]\n"
+      ".inst 0x6f98e2aa  // udot v10.4s, v21.16b, v24.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f97e2ae  // udot v14.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x6f96e2b2  // udot v18.4s, v21.16b, v22.4b[0]\n"
+      ".inst 0x6f98e28b  // udot v11.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x6f97e28f  // udot v15.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x6f96e293  // udot v19.4s, v20.16b, v22.4b[0]\n"
       "bge 88b\n"
       "89:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x14, 92f\n"
       "tbz x14, #1, 90f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
       "tbz x14, #0, 91f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
       "b 91f\n"
       "90:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
       "91:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q21, [x16, #0x0]\n"
+      ".inst 0x6f80e2a8  // udot v8.4s, v21.16b, v0.4b[0]\n"
+      "ldr q20, [x16, #0x10]\n"
+      ".inst 0x6f81e2ac  // udot v12.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x6f82e2b0  // udot v16.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x16, #0x20]\n"
+      ".inst 0x6f80e289  // udot v9.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x6f81e28d  // udot v13.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e291  // udot v17.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x16, #0x30]\n"
+      ".inst 0x6f80e2aa  // udot v10.4s, v21.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f81e2ae  // udot v14.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x6f82e2b2  // udot v18.4s, v21.16b, v2.4b[0]\n"
+      ".inst 0x6f80e28b  // udot v11.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x6f81e28f  // udot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e293  // udot v19.4s, v20.16b, v2.4b[0]\n"
       "92:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -1475,336 +1474,336 @@
       "116:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 117f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
       "cbnz x15, 118f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
-      "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "b 118f\n"
       "117:"  // Height 4: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
       "118:"  // Height 4: input setup done
       "cmp x14, #0x10\n"
       "blt 121f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 120f\n"
       "119:"  // Height 4: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d25, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v25.d[1], x21\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "add x25, x25, #0x10\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr x10, [x13, #0x8]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr x26, [x27, #0x8]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr d24, [x16, #0x30]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6f80e32a  // udot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e32e  // udot v14.4s, v25.16b, v1.4b[0]\n"
+      "ldr x20, [x16, #0x58]\n"
+      ".inst 0x6f82e332  // udot v18.4s, v25.16b, v2.4b[0]\n"
+      "add x11, x11, #0x10\n"
+      ".inst 0x6f83e336  // udot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr d25, [x16, #0x40]\n"
+      ".inst 0x6f80e30b  // udot v11.4s, v24.16b, v0.4b[0]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6f81e30f  // udot v15.4s, v24.16b, v1.4b[0]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x6f82e313  // udot v19.4s, v24.16b, v2.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x6f83e317  // udot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr d24, [x16, #0x50]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6fa0e328  // udot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32c  // udot v12.4s, v25.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x6fa2e330  // udot v16.4s, v25.16b, v2.4b[1]\n"
+      "ldr x25, [x13, #0x8]\n"
+      ".inst 0x6fa3e334  // udot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr d25, [x16, #0x60]\n"
+      ".inst 0x6fa0e309  // udot v9.4s, v24.16b, v0.4b[1]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6fa1e30d  // udot v13.4s, v24.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0x88]\n"
+      ".inst 0x6fa2e311  // udot v17.4s, v24.16b, v2.4b[1]\n"
+      "ldr x24, [x12, #0x8]\n"
+      ".inst 0x6fa3e315  // udot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr d24, [x16, #0x70]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6fa0e32a  // udot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32e  // udot v14.4s, v25.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x98]\n"
+      ".inst 0x6fa2e332  // udot v18.4s, v25.16b, v2.4b[1]\n"
+      "ldr x23, [x11, #0x8]\n"
+      ".inst 0x6fa3e336  // udot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr d25, [x16, #0x80]\n"
+      ".inst 0x6fa0e30b  // udot v11.4s, v24.16b, v0.4b[1]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6fa1e30f  // udot v15.4s, v24.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x6fa2e313  // udot v19.4s, v24.16b, v2.4b[1]\n"
+      "ldr x22, [x10, #0x8]\n"
+      ".inst 0x6fa3e317  // udot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr d24, [x16, #0x90]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6f80eb28  // udot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2c  // udot v12.4s, v25.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x6f82eb30  // udot v16.4s, v25.16b, v2.4b[2]\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb34  // udot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr d25, [x16, #0xa0]\n"
+      ".inst 0x6f80eb09  // udot v9.4s, v24.16b, v0.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6f81eb0d  // udot v13.4s, v24.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xc8]\n"
+      ".inst 0x6f82eb11  // udot v17.4s, v24.16b, v2.4b[2]\n"
       "cmp x14, #0x20\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb15  // udot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr d24, [x16, #0xb0]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6f80eb2a  // udot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2e  // udot v14.4s, v25.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xd8]\n"
+      ".inst 0x6f82eb32  // udot v18.4s, v25.16b, v2.4b[2]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x6f83eb36  // udot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr d25, [x16, #0xc0]\n"
+      ".inst 0x6f80eb0b  // udot v11.4s, v24.16b, v0.4b[2]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6f81eb0f  // udot v15.4s, v24.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x6f82eb13  // udot v19.4s, v24.16b, v2.4b[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x6f83eb17  // udot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr d24, [x16, #0xd0]\n"
+      "mov v24.d[1], x20\n"
+      ".inst 0x6fa0eb28  // udot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb2c  // udot v12.4s, v25.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x6fa2eb30  // udot v16.4s, v25.16b, v2.4b[3]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x6fa3eb34  // udot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr d25, [x16, #0xe0]\n"
+      ".inst 0x6fa0eb09  // udot v9.4s, v24.16b, v0.4b[3]\n"
+      "mov v25.d[1], x21\n"
+      ".inst 0x6fa1eb0d  // udot v13.4s, v24.16b, v1.4b[3]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6fa2eb11  // udot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb15  // udot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr d24, [x16, #0xf0]\n"
+      "mov v24.d[1], x20\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0x18]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa0eb2a  // udot v10.4s, v25.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0x8]\n"
+      ".inst 0x6fa1eb2e  // udot v14.4s, v25.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0x18]\n"
+      ".inst 0x6fa2eb32  // udot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb36  // udot v22.4s, v25.16b, v3.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eb0b  // udot v11.4s, v24.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      ".inst 0x6fa1eb0f  // udot v15.4s, v24.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      ".inst 0x6fa2eb13  // udot v19.4s, v24.16b, v2.4b[3]\n"
+      "ldr d2, [x11, #0x0]\n"
+      ".inst 0x6fa3eb17  // udot v23.4s, v24.16b, v3.4b[3]\n"
+      "ldr d3, [x10, #0x0]\n"
       "ldr d7, [x16, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
-      "mov v7.d[1], x11\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x25\n"
+      "mov v1.d[1], x24\n"
+      "mov v2.d[1], x23\n"
+      "mov v3.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 119b\n"
       "120:"  // Height 4: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q25, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      "ldr q24, [x16, #0x30]\n"
+      ".inst 0x6f80e32a  // udot v10.4s, v25.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x6f81e32e  // udot v14.4s, v25.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x6f82e332  // udot v18.4s, v25.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f83e336  // udot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x16, #0x40]\n"
+      ".inst 0x6f80e30b  // udot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30f  // udot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x6f82e313  // udot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e317  // udot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x16, #0x50]\n"
+      ".inst 0x6fa0e328  // udot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32c  // udot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e330  // udot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e334  // udot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x16, #0x60]\n"
+      ".inst 0x6fa0e309  // udot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e30d  // udot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e311  // udot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e315  // udot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x16, #0x70]\n"
+      ".inst 0x6fa0e32a  // udot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32e  // udot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e332  // udot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e336  // udot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x16, #0x80]\n"
+      ".inst 0x6fa0e30b  // udot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e30f  // udot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e313  // udot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e317  // udot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x16, #0x90]\n"
+      ".inst 0x6f80eb28  // udot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2c  // udot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb30  // udot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb34  // udot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x16, #0xa0]\n"
+      ".inst 0x6f80eb09  // udot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb0d  // udot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb11  // udot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb15  // udot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x16, #0xb0]\n"
+      ".inst 0x6f80eb2a  // udot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2e  // udot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb32  // udot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb36  // udot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x16, #0xc0]\n"
+      ".inst 0x6f80eb0b  // udot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb0f  // udot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb13  // udot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb17  // udot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x16, #0xd0]\n"
+      ".inst 0x6fa0eb28  // udot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb2c  // udot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb30  // udot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb34  // udot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x16, #0xe0]\n"
+      ".inst 0x6fa0eb09  // udot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb0d  // udot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb11  // udot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb15  // udot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x16, #0xf0]\n"
+      ".inst 0x6fa0eb2a  // udot v10.4s, v25.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa1eb2e  // udot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb32  // udot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb36  // udot v22.4s, v25.16b, v3.4b[3]\n"
+      ".inst 0x6fa0eb0b  // udot v11.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb0f  // udot v15.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb13  // udot v19.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb17  // udot v23.4s, v24.16b, v3.4b[3]\n"
       "121:"  // Height 4: Multiply loop: Main loop skip
       "cbz x14, 126f\n"
       "cmp x14, #0x4\n"
       "blt 123f\n"
       "122:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s29, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s28, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s27, [x11], #0x4\n"
+      "ldr s26, [x10], #0x4\n"
+      "ldr q25, [x16, #0x0]\n"
+      ".inst 0x6f9de328  // udot v8.4s, v25.16b, v29.4b[0]\n"
+      "ldr q24, [x16, #0x10]\n"
+      ".inst 0x6f9ce32c  // udot v12.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x6f9be330  // udot v16.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae334  // udot v20.4s, v25.16b, v26.4b[0]\n"
+      "ldr q25, [x16, #0x20]\n"
+      ".inst 0x6f9de309  // udot v9.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x6f9ce30d  // udot v13.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x6f9be311  // udot v17.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae315  // udot v21.4s, v24.16b, v26.4b[0]\n"
+      "ldr q24, [x16, #0x30]\n"
+      ".inst 0x6f9de32a  // udot v10.4s, v25.16b, v29.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f9ce32e  // udot v14.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x6f9be332  // udot v18.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae336  // udot v22.4s, v25.16b, v26.4b[0]\n"
+      ".inst 0x6f9de30b  // udot v11.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x6f9ce30f  // udot v15.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x6f9be313  // udot v19.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae317  // udot v23.4s, v24.16b, v26.4b[0]\n"
       "bge 122b\n"
       "123:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x14, 126f\n"
       "tbz x14, #1, 124f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h3, [x10], #0x2\n"
       "tbz x14, #0, 125f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
+      "ld1 { v3.b }[2], [x10]\n"
       "b 125f\n"
       "124:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
+      "ldr b3, [x10, #0x0]\n"
       "125:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q25, [x16, #0x0]\n"
+      ".inst 0x6f80e328  // udot v8.4s, v25.16b, v0.4b[0]\n"
+      "ldr q24, [x16, #0x10]\n"
+      ".inst 0x6f81e32c  // udot v12.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f82e330  // udot v16.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x6f83e334  // udot v20.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x16, #0x20]\n"
+      ".inst 0x6f80e309  // udot v9.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30d  // udot v13.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x6f82e311  // udot v17.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e315  // udot v21.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x16, #0x30]\n"
+      ".inst 0x6f80e32a  // udot v10.4s, v25.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f81e32e  // udot v14.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f82e332  // udot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x6f83e336  // udot v22.4s, v25.16b, v3.4b[0]\n"
+      ".inst 0x6f80e30b  // udot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30f  // udot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x6f82e313  // udot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e317  // udot v23.4s, v24.16b, v3.4b[0]\n"
       "126:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -2116,391 +2115,391 @@
       "150:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 151f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
       "cbnz x15, 152f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
-      "add x23, x23, x20\n"
       "b 152f\n"
       "151:"  // Height 5: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
-      "add x23, x25, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
       "152:"  // Height 5: input setup done
       "cmp x14, #0x10\n"
       "blt 155f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 154f\n"
       "153:"  // Height 5: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr d6, [x16, #0x20]\n"
+      "ldr d29, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v29.d[1], x21\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "add x23, x23, #0x10\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr x10, [x13, #0x8]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr d6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr x28, [x9, #0x8]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr x26, [x27, #0x8]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr x24, [x25, #0x8]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr x22, [x23, #0x8]\n"
-      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr d6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr d28, [x16, #0x30]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6f80e3aa  // udot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
+      "ldr x20, [x16, #0x58]\n"
+      ".inst 0x6f82e3b2  // udot v18.4s, v29.16b, v2.4b[0]\n"
+      "add x9, x9, #0x10\n"
+      ".inst 0x6f83e3b6  // udot v22.4s, v29.16b, v3.4b[0]\n"
+      "ldr x26, [x13, #0x8]\n"
+      ".inst 0x6f84e3ba  // udot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr d29, [x16, #0x40]\n"
+      ".inst 0x6f80e38b  // udot v11.4s, v28.16b, v0.4b[0]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      "ldr x21, [x16, #0x68]\n"
+      ".inst 0x6f82e393  // udot v19.4s, v28.16b, v2.4b[0]\n"
+      "ldr x25, [x12, #0x8]\n"
+      ".inst 0x6f83e397  // udot v23.4s, v28.16b, v3.4b[0]\n"
+      "ldr x24, [x11, #0x8]\n"
+      ".inst 0x6f84e39b  // udot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr d28, [x16, #0x50]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6fa0e3a8  // udot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ac  // udot v12.4s, v29.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x78]\n"
+      ".inst 0x6fa2e3b0  // udot v16.4s, v29.16b, v2.4b[1]\n"
+      "ldr x23, [x10, #0x8]\n"
+      ".inst 0x6fa3e3b4  // udot v20.4s, v29.16b, v3.4b[1]\n"
+      "ldr x22, [x9, #0x8]\n"
+      ".inst 0x6fa4e3b8  // udot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr d29, [x16, #0x60]\n"
+      ".inst 0x6fa0e389  // udot v9.4s, v28.16b, v0.4b[1]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6fa1e38d  // udot v13.4s, v28.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0x88]\n"
+      ".inst 0x6fa2e391  // udot v17.4s, v28.16b, v2.4b[1]\n"
       "sub x14, x14, #0x10\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa3e395  // udot v21.4s, v28.16b, v3.4b[1]\n"
       "cmp x14, #0x20\n"
-      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa4e399  // udot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr d28, [x16, #0x70]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6fa0e3aa  // udot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ae  // udot v14.4s, v29.16b, v1.4b[1]\n"
+      "ldr x20, [x16, #0x98]\n"
+      ".inst 0x6fa2e3b2  // udot v18.4s, v29.16b, v2.4b[1]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa3e3b6  // udot v22.4s, v29.16b, v3.4b[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      ".inst 0x6fa4e3ba  // udot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr d29, [x16, #0x80]\n"
+      ".inst 0x6fa0e38b  // udot v11.4s, v28.16b, v0.4b[1]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6fa1e38f  // udot v15.4s, v28.16b, v1.4b[1]\n"
+      "ldr x21, [x16, #0xa8]\n"
+      ".inst 0x6fa2e393  // udot v19.4s, v28.16b, v2.4b[1]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x6fa3e397  // udot v23.4s, v28.16b, v3.4b[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6fa4e39b  // udot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr d28, [x16, #0x90]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6f80eba8  // udot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebac  // udot v12.4s, v29.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xb8]\n"
+      ".inst 0x6f82ebb0  // udot v16.4s, v29.16b, v2.4b[2]\n"
       "prfm pldl1keep, [x9, #0x80]\n"
-      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr d6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr d6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr d6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr d6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      ".inst 0x6f83ebb4  // udot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebb8  // udot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr d29, [x16, #0xa0]\n"
+      ".inst 0x6f80eb89  // udot v9.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6f81eb8d  // udot v13.4s, v28.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xc8]\n"
+      ".inst 0x6f82eb91  // udot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb95  // udot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb99  // udot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr d28, [x16, #0xb0]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6f80ebaa  // udot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebae  // udot v14.4s, v29.16b, v1.4b[2]\n"
+      "ldr x20, [x16, #0xd8]\n"
+      ".inst 0x6f82ebb2  // udot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb6  // udot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebba  // udot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr d29, [x16, #0xc0]\n"
+      ".inst 0x6f80eb8b  // udot v11.4s, v28.16b, v0.4b[2]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6f81eb8f  // udot v15.4s, v28.16b, v1.4b[2]\n"
+      "ldr x21, [x16, #0xe8]\n"
+      ".inst 0x6f82eb93  // udot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb97  // udot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb9b  // udot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr d28, [x16, #0xd0]\n"
+      "mov v28.d[1], x20\n"
+      ".inst 0x6fa0eba8  // udot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebac  // udot v12.4s, v29.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0xf8]\n"
+      ".inst 0x6fa2ebb0  // udot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb4  // udot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebb8  // udot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr d29, [x16, #0xe0]\n"
+      ".inst 0x6fa0eb89  // udot v9.4s, v28.16b, v0.4b[3]\n"
+      "mov v29.d[1], x21\n"
+      ".inst 0x6fa1eb8d  // udot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb91  // udot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb95  // udot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x6fa4eb99  // udot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr d28, [x16, #0xf0]\n"
+      "mov v28.d[1], x20\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0x18]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa0ebaa  // udot v10.4s, v29.16b, v0.4b[3]\n"
+      "ldr x21, [x16, #0x8]\n"
+      ".inst 0x6fa1ebae  // udot v14.4s, v29.16b, v1.4b[3]\n"
+      "ldr x20, [x16, #0x18]\n"
+      ".inst 0x6fa2ebb2  // udot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb6  // udot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebba  // udot v26.4s, v29.16b, v4.4b[3]\n"
       "ldr d6, [x16, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eb8b  // udot v11.4s, v28.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
-      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      ".inst 0x6fa1eb8f  // udot v15.4s, v28.16b, v1.4b[3]\n"
+      "ldr d1, [x12, #0x0]\n"
+      ".inst 0x6fa2eb93  // udot v19.4s, v28.16b, v2.4b[3]\n"
+      "ldr d2, [x11, #0x0]\n"
+      ".inst 0x6fa3eb97  // udot v23.4s, v28.16b, v3.4b[3]\n"
+      "ldr d3, [x10, #0x0]\n"
+      ".inst 0x6fa4eb9b  // udot v27.4s, v28.16b, v4.4b[3]\n"
+      "ldr d4, [x9, #0x0]\n"
       "ldr d7, [x16, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
-      "mov v3.d[1], x24\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x26\n"
+      "mov v1.d[1], x25\n"
+      "mov v2.d[1], x24\n"
+      "mov v3.d[1], x23\n"
       "mov v4.d[1], x22\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "bge 153b\n"
       "154:"  // Height 5: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
+      "ldr q29, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x16, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x16, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x16, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x16, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x16, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x16, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x16, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x16, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x16, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x16, #0xf0]\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      "ldr q28, [x16, #0x30]\n"
+      ".inst 0x6f80e3aa  // udot v10.4s, v29.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e3b2  // udot v18.4s, v29.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
+      ".inst 0x6f83e3b6  // udot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x6f84e3ba  // udot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x16, #0x40]\n"
+      ".inst 0x6f80e38b  // udot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e393  // udot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e397  // udot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e39b  // udot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x16, #0x50]\n"
+      ".inst 0x6fa0e3a8  // udot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ac  // udot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3b0  // udot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e3b4  // udot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e3b8  // udot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x16, #0x60]\n"
+      ".inst 0x6fa0e389  // udot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e38d  // udot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e391  // udot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e395  // udot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e399  // udot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x16, #0x70]\n"
+      ".inst 0x6fa0e3aa  // udot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ae  // udot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3b2  // udot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e3b6  // udot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e3ba  // udot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x16, #0x80]\n"
+      ".inst 0x6fa0e38b  // udot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e38f  // udot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e393  // udot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e397  // udot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e39b  // udot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x16, #0x90]\n"
+      ".inst 0x6f80eba8  // udot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebac  // udot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f82ebb0  // udot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb4  // udot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebb8  // udot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x16, #0xa0]\n"
+      ".inst 0x6f80eb89  // udot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb8d  // udot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb91  // udot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb95  // udot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb99  // udot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x16, #0xb0]\n"
+      ".inst 0x6f80ebaa  // udot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebae  // udot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f82ebb2  // udot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb6  // udot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebba  // udot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x16, #0xc0]\n"
+      ".inst 0x6f80eb8b  // udot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb8f  // udot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb93  // udot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb97  // udot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb9b  // udot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x16, #0xd0]\n"
+      ".inst 0x6fa0eba8  // udot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebac  // udot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebb0  // udot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb4  // udot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebb8  // udot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x16, #0xe0]\n"
+      ".inst 0x6fa0eb89  // udot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb8d  // udot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb91  // udot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb95  // udot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x6fa4eb99  // udot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x16, #0xf0]\n"
+      ".inst 0x6fa0ebaa  // udot v10.4s, v29.16b, v0.4b[3]\n"
       "add x16, x16, #0x100\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa1ebae  // udot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebb2  // udot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb6  // udot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebba  // udot v26.4s, v29.16b, v4.4b[3]\n"
+      ".inst 0x6fa0eb8b  // udot v11.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb8f  // udot v15.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb93  // udot v19.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb97  // udot v23.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x6fa4eb9b  // udot v27.4s, v28.16b, v4.4b[3]\n"
       "155:"  // Height 5: Multiply loop: Main loop skip
       "cbz x14, 160f\n"
       "cmp x14, #0x4\n"
       "blt 157f\n"
       "156:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s2, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s0, [x11], #0x4\n"
+      "ldr s31, [x10], #0x4\n"
+      "ldr s30, [x9], #0x4\n"
+      "ldr q29, [x16, #0x0]\n"
+      ".inst 0x6f82e3a8  // udot v8.4s, v29.16b, v2.4b[0]\n"
+      "ldr q28, [x16, #0x10]\n"
+      ".inst 0x6f81e3ac  // udot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f80e3b0  // udot v16.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe3b4  // udot v20.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee3b8  // udot v24.4s, v29.16b, v30.4b[0]\n"
+      "ldr q29, [x16, #0x20]\n"
+      ".inst 0x6f82e389  // udot v9.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f81e38d  // udot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f80e391  // udot v17.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe395  // udot v21.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee399  // udot v25.4s, v28.16b, v30.4b[0]\n"
+      "ldr q28, [x16, #0x30]\n"
+      ".inst 0x6f82e3aa  // udot v10.4s, v29.16b, v2.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f80e3b2  // udot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe3b6  // udot v22.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee3ba  // udot v26.4s, v29.16b, v30.4b[0]\n"
+      ".inst 0x6f82e38b  // udot v11.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f80e393  // udot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe397  // udot v23.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee39b  // udot v27.4s, v28.16b, v30.4b[0]\n"
       "bge 156b\n"
       "157:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x14, 160f\n"
       "tbz x14, #1, 158f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h3, [x10], #0x2\n"
+      "ldr h4, [x9], #0x2\n"
       "tbz x14, #0, 159f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
-      "ld1 { v4.b }[2], [x23]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
+      "ld1 { v3.b }[2], [x10]\n"
+      "ld1 { v4.b }[2], [x9]\n"
       "b 159f\n"
       "158:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
-      "ldr b4, [x23, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
+      "ldr b3, [x10, #0x0]\n"
+      "ldr b4, [x9, #0x0]\n"
       "159:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q29, [x16, #0x0]\n"
+      ".inst 0x6f80e3a8  // udot v8.4s, v29.16b, v0.4b[0]\n"
+      "ldr q28, [x16, #0x10]\n"
+      ".inst 0x6f81e3ac  // udot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3b0  // udot v16.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f83e3b4  // udot v20.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x6f84e3b8  // udot v24.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x16, #0x20]\n"
+      ".inst 0x6f80e389  // udot v9.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38d  // udot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e391  // udot v17.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e395  // udot v21.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e399  // udot v25.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x16, #0x30]\n"
+      ".inst 0x6f80e3aa  // udot v10.4s, v29.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3b2  // udot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f83e3b6  // udot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x6f84e3ba  // udot v26.4s, v29.16b, v4.4b[0]\n"
+      ".inst 0x6f80e38b  // udot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e393  // udot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e397  // udot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e39b  // udot v27.4s, v28.16b, v4.4b[0]\n"
       "160:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -2862,98 +2861,98 @@
       "184:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w14, [x20, x15, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 185f\n"
-      "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x13, [x21, #0x0]\n"
-      "ldr x9, [x21, #0x8]\n"
-      "ldr x27, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x23, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x13, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x11, [x20, #0x10]\n"
+      "ldr x10, [x20, #0x18]\n"
+      "ldr x9, [x20, #0x20]\n"
+      "ldr x28, [x20, #0x28]\n"
       "cbnz x15, 186f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x13, x13, x20\n"
+      "add x12, x12, x20\n"
+      "add x11, x11, x20\n"
+      "add x10, x10, x20\n"
       "add x9, x9, x20\n"
-      "add x27, x27, x20\n"
-      "add x25, x25, x20\n"
-      "add x23, x23, x20\n"
-      "add x21, x21, x20\n"
+      "add x28, x28, x20\n"
       "b 186f\n"
       "185:"  // Height 6: setup direct input
       "mov x13, %x[input_ptr]\n"
-      "add x9, x13, x20\n"
-      "add x27, x9, x20\n"
-      "add x25, x27, x20\n"
-      "add x23, x25, x20\n"
-      "add x21, x23, x20\n"
+      "add x12, x13, x21\n"
+      "add x11, x12, x21\n"
+      "add x10, x11, x21\n"
+      "add x9, x10, x21\n"
+      "add x28, x9, x21\n"
       "186:"  // Height 6: input setup done
       "cmp x14, #0x10\n"
       "blt 189f\n"
       "ldr q0, [x13, #0x0]\n"
       "cmp x14, #0x20\n"
-      "ldr q1, [x9, #0x0]\n"
-      "ldr q2, [x27, #0x0]\n"
-      "ldr q3, [x25, #0x0]\n"
-      "ldr q4, [x23, #0x0]\n"
-      "ldr q5, [x21, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x11, #0x0]\n"
+      "ldr q3, [x10, #0x0]\n"
+      "ldr q4, [x9, #0x0]\n"
+      "ldr q5, [x28, #0x0]\n"
       "ldr q6, [x16, #0x0]\n"
       "ldr q7, [x16, #0x10]\n"
       "blt 188f\n"
       "187:"  // Height 6: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr x12, [x16, #0x28]\n"
+      "ldr x21, [x16, #0x28]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x38]\n"
+      "ldr x20, [x16, #0x38]\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
       "ldr d6, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x48]\n"
+      "ldr x21, [x16, #0x48]\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
       "ldr d7, [x16, #0x30]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr x11, [x16, #0x58]\n"
+      "ldr x20, [x16, #0x58]\n"
       ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr x10, [x13, #0x8]\n"
+      "ldr x27, [x13, #0x8]\n"
       ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr x28, [x9, #0x8]\n"
+      "ldr x26, [x12, #0x8]\n"
       ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr x26, [x27, #0x8]\n"
+      "ldr x25, [x11, #0x8]\n"
       ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
       "ldr d6, [x16, #0x40]\n"
       ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr x12, [x16, #0x68]\n"
+      "ldr x21, [x16, #0x68]\n"
       ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr x24, [x25, #0x8]\n"
+      "ldr x24, [x10, #0x8]\n"
       ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr x22, [x23, #0x8]\n"
+      "ldr x23, [x9, #0x8]\n"
       ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr x20, [x21, #0x8]\n"
+      "ldr x22, [x28, #0x8]\n"
       ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
       "ldr d7, [x16, #0x50]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
       ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x78]\n"
+      "ldr x20, [x16, #0x78]\n"
       ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
@@ -2963,96 +2962,96 @@
       ".inst 0x6fa5e0dc  // udot v28.4s, v6.16b, v5.4b[1]\n"
       "ldr d6, [x16, #0x60]\n"
       ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0x88]\n"
+      "ldr x21, [x16, #0x88]\n"
       ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       ".inst 0x6fa5e0fd  // udot v29.4s, v7.16b, v5.4b[1]\n"
       "ldr d7, [x16, #0x70]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
       ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr x11, [x16, #0x98]\n"
+      "ldr x20, [x16, #0x98]\n"
       ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
       ".inst 0x6fa5e0de  // udot v30.4s, v6.16b, v5.4b[1]\n"
       "ldr d6, [x16, #0x80]\n"
       ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr x12, [x16, #0xa8]\n"
+      "ldr x21, [x16, #0xa8]\n"
       ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
       ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
       ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
       ".inst 0x6fa5e0ff  // udot v31.4s, v7.16b, v5.4b[1]\n"
       "ldr d7, [x16, #0x90]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
       ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xb8]\n"
+      "ldr x20, [x16, #0xb8]\n"
       ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
       ".inst 0x6f85e8dc  // udot v28.4s, v6.16b, v5.4b[2]\n"
       "ldr d6, [x16, #0xa0]\n"
       ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xc8]\n"
+      "ldr x21, [x16, #0xc8]\n"
       ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
       ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
       ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
       ".inst 0x6f85e8fd  // udot v29.4s, v7.16b, v5.4b[2]\n"
       "ldr d7, [x16, #0xb0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
       ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr x11, [x16, #0xd8]\n"
+      "ldr x20, [x16, #0xd8]\n"
       ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
       ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
       ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
       ".inst 0x6f85e8de  // udot v30.4s, v6.16b, v5.4b[2]\n"
       "ldr d6, [x16, #0xc0]\n"
       ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr x12, [x16, #0xe8]\n"
+      "ldr x21, [x16, #0xe8]\n"
       ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
       ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
       ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
       ".inst 0x6f85e8ff  // udot v31.4s, v7.16b, v5.4b[2]\n"
       "ldr d7, [x16, #0xd0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
       ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0xf8]\n"
+      "ldr x20, [x16, #0xf8]\n"
       ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
       ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
       ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
       ".inst 0x6fa5e8dc  // udot v28.4s, v6.16b, v5.4b[3]\n"
       "ldr d6, [x16, #0xe0]\n"
       ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "mov v6.d[1], x12\n"
+      "mov v6.d[1], x21\n"
       ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
       ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
       ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
       ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
       ".inst 0x6fa5e8fd  // udot v29.4s, v7.16b, v5.4b[3]\n"
       "ldr d7, [x16, #0xf0]\n"
-      "mov v7.d[1], x11\n"
+      "mov v7.d[1], x20\n"
       "add x16, x16, #0x100\n"
       ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      "ldr x12, [x16, #0x8]\n"
+      "ldr x21, [x16, #0x8]\n"
       ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      "ldr x11, [x16, #0x18]\n"
+      "ldr x20, [x16, #0x18]\n"
       ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
       ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
       ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
@@ -3061,56 +3060,56 @@
       ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
       "ldr d0, [x13, #0x0]\n"
       ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      "ldr d1, [x9, #0x0]\n"
+      "ldr d1, [x12, #0x0]\n"
       ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      "ldr d2, [x27, #0x0]\n"
+      "ldr d2, [x11, #0x0]\n"
       ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
-      "ldr d3, [x25, #0x0]\n"
+      "ldr d3, [x10, #0x0]\n"
       ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
-      "ldr d4, [x23, #0x0]\n"
+      "ldr d4, [x9, #0x0]\n"
       ".inst 0x6fa5e8ff  // udot v31.4s, v7.16b, v5.4b[3]\n"
-      "ldr d5, [x21, #0x0]\n"
+      "ldr d5, [x28, #0x0]\n"
       "ldr d7, [x16, #0x10]\n"
-      "mov v6.d[1], x12\n"
-      "mov v0.d[1], x10\n"
-      "mov v1.d[1], x28\n"
-      "mov v2.d[1], x26\n"
+      "mov v6.d[1], x21\n"
+      "mov v0.d[1], x27\n"
+      "mov v1.d[1], x26\n"
+      "mov v2.d[1], x25\n"
       "mov v3.d[1], x24\n"
-      "mov v4.d[1], x22\n"
-      "mov v5.d[1], x20\n"
-      "mov v7.d[1], x11\n"
+      "mov v4.d[1], x23\n"
+      "mov v5.d[1], x22\n"
+      "mov v7.d[1], x20\n"
       "bge 187b\n"
       "188:"  // Height 6: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       "add x13, x13, #0x10\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "add x9, x9, #0x10\n"
+      "add x12, x12, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "add x27, x27, #0x10\n"
+      "add x11, x11, #0x10\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "add x25, x25, #0x10\n"
+      "add x10, x10, #0x10\n"
       ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "add x23, x23, #0x10\n"
+      "add x9, x9, #0x10\n"
       ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
       "ldr q6, [x16, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "add x21, x21, #0x10\n"
+      "add x28, x28, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
       "sub x14, x14, #0x10\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
       "prfm pldl1keep, [x13, #0x80]\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "prfm pldl1keep, [x9, #0x80]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
       ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "prfm pldl1keep, [x27, #0x80]\n"
+      "prfm pldl1keep, [x11, #0x80]\n"
       ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
       "ldr q7, [x16, #0x30]\n"
       ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "prfm pldl1keep, [x25, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
       ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "prfm pldl1keep, [x23, #0x80]\n"
+      "prfm pldl1keep, [x9, #0x80]\n"
       ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "prfm pldl1keep, [x21, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
       ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
       ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
       ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
@@ -3210,98 +3209,98 @@
       "cmp x14, #0x4\n"
       "blt 191f\n"
       "190:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x13], #0x4\n"
+      "ldr s7, [x13], #0x4\n"
       "sub x14, x14, #0x4\n"
-      "ldr s1, [x9], #0x4\n"
+      "ldr s6, [x12], #0x4\n"
       "cmp x14, #0x4\n"
-      "ldr s2, [x27], #0x4\n"
-      "ldr s3, [x25], #0x4\n"
-      "ldr s4, [x23], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr s5, [x11], #0x4\n"
+      "ldr s4, [x10], #0x4\n"
+      "ldr s3, [x9], #0x4\n"
+      "ldr s2, [x28], #0x4\n"
+      "ldr q1, [x16, #0x0]\n"
+      ".inst 0x6f87e028  // udot v8.4s, v1.16b, v7.4b[0]\n"
+      "ldr q0, [x16, #0x10]\n"
+      ".inst 0x6f86e02c  // udot v12.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x6f85e030  // udot v16.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x6f84e034  // udot v20.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x6f83e038  // udot v24.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x6f82e03c  // udot v28.4s, v1.16b, v2.4b[0]\n"
+      "ldr q1, [x16, #0x20]\n"
+      ".inst 0x6f87e009  // udot v9.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x6f86e00d  // udot v13.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x6f85e011  // udot v17.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x6f84e015  // udot v21.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x6f83e019  // udot v25.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x6f82e01d  // udot v29.4s, v0.16b, v2.4b[0]\n"
+      "ldr q0, [x16, #0x30]\n"
+      ".inst 0x6f87e02a  // udot v10.4s, v1.16b, v7.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x6f86e02e  // udot v14.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x6f85e032  // udot v18.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x6f84e036  // udot v22.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x6f83e03a  // udot v26.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x6f82e03e  // udot v30.4s, v1.16b, v2.4b[0]\n"
+      ".inst 0x6f87e00b  // udot v11.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x6f86e00f  // udot v15.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x6f85e013  // udot v19.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x6f84e017  // udot v23.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x6f83e01b  // udot v27.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x6f82e01f  // udot v31.4s, v0.16b, v2.4b[0]\n"
       "bge 190b\n"
       "191:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x14, 194f\n"
       "tbz x14, #1, 192f\n"
       "ldr h0, [x13], #0x2\n"
-      "ldr h1, [x9], #0x2\n"
-      "ldr h2, [x27], #0x2\n"
-      "ldr h3, [x25], #0x2\n"
-      "ldr h4, [x23], #0x2\n"
-      "ldr h5, [x21], #0x2\n"
+      "ldr h1, [x12], #0x2\n"
+      "ldr h2, [x11], #0x2\n"
+      "ldr h3, [x10], #0x2\n"
+      "ldr h4, [x9], #0x2\n"
+      "ldr h5, [x28], #0x2\n"
       "tbz x14, #0, 193f\n"
       "ld1 { v0.b }[2], [x13]\n"
-      "ld1 { v1.b }[2], [x9]\n"
-      "ld1 { v2.b }[2], [x27]\n"
-      "ld1 { v3.b }[2], [x25]\n"
-      "ld1 { v4.b }[2], [x23]\n"
-      "ld1 { v5.b }[2], [x21]\n"
+      "ld1 { v1.b }[2], [x12]\n"
+      "ld1 { v2.b }[2], [x11]\n"
+      "ld1 { v3.b }[2], [x10]\n"
+      "ld1 { v4.b }[2], [x9]\n"
+      "ld1 { v5.b }[2], [x28]\n"
       "b 193f\n"
       "192:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x13, #0x0]\n"
-      "ldr b1, [x9, #0x0]\n"
-      "ldr b2, [x27, #0x0]\n"
-      "ldr b3, [x25, #0x0]\n"
-      "ldr b4, [x23, #0x0]\n"
-      "ldr b5, [x21, #0x0]\n"
+      "ldr b1, [x12, #0x0]\n"
+      "ldr b2, [x11, #0x0]\n"
+      "ldr b3, [x10, #0x0]\n"
+      "ldr b4, [x9, #0x0]\n"
+      "ldr b5, [x28, #0x0]\n"
       "193:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x16, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q7, [x16, #0x10]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x16, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x16, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x16, #0x0]\n"
+      ".inst 0x6f80e0e8  // udot v8.4s, v7.16b, v0.4b[0]\n"
+      "ldr q6, [x16, #0x10]\n"
+      ".inst 0x6f81e0ec  // udot v12.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f0  // udot v16.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f4  // udot v20.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f8  // udot v24.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fc  // udot v28.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x16, #0x20]\n"
+      ".inst 0x6f80e0c9  // udot v9.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0cd  // udot v13.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d1  // udot v17.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d5  // udot v21.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0d9  // udot v25.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0dd  // udot v29.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x16, #0x30]\n"
+      ".inst 0x6f80e0ea  // udot v10.4s, v7.16b, v0.4b[0]\n"
       "add x16, x16, #0x40\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x6f81e0ee  // udot v14.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f2  // udot v18.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f6  // udot v22.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fa  // udot v26.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fe  // udot v30.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x6f80e0cb  // udot v11.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0cf  // udot v15.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d3  // udot v19.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d7  // udot v23.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0db  // udot v27.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0df  // udot v31.4s, v6.16b, v5.4b[0]\n"
       "194:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x15, x15, #0x1\n"
@@ -3488,7 +3487,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "206:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index 38131cf..849c680 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -77,7 +77,6 @@
     ka.N = N;
     ka.B_ptr = B_ptr;
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 171f\n"
@@ -165,11 +164,11 @@
       "14:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 15f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -186,37 +185,37 @@
       "blt 18f\n"
       "17:"  // Height 1: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
       "cmp x27, #0x20\n"
       "add x10, x10, #0x100\n"
@@ -226,37 +225,37 @@
       "bge 17b\n"
       "18:"  // Height 1: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "19:"  // Height 1: Multiply loop: Main loop skip
@@ -264,17 +263,17 @@
       "cmp x27, #0x4\n"
       "blt 21f\n"
       "20:"  // Height 1: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr s18, [x26], #0x4\n"
+      "ldr q16, [x10, #0x0]\n"
+      ".inst 0x6f92e208  // udot v8.4s, v16.16b, v18.4b[0]\n"
       "sub x27, x27, #0x4\n"
-      "ldr q7, [x10, #0x10]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6f92e209  // udot v9.4s, v16.16b, v18.4b[0]\n"
       "cmp x27, #0x4\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x6f92e22a  // udot v10.4s, v17.16b, v18.4b[0]\n"
+      ".inst 0x6f92e20b  // udot v11.4s, v16.16b, v18.4b[0]\n"
       "add x10, x10, #0x40\n"
       "bge 20b\n"
       "21:"  // Height 1: Multiply loop: Skip odd blocks
@@ -287,14 +286,14 @@
       "22:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b0, [x26, #0x0]\n"
       "23:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x6f80e228  // udot v8.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x6f80e209  // udot v9.4s, v16.16b, v0.4b[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
       "add x10, x10, #0x40\n"
       "24:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -465,12 +464,12 @@
       "48:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 49f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 50f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -478,7 +477,7 @@
       "b 50f\n"
       "49:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "50:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "blt 53f\n"
@@ -491,137 +490,137 @@
       "51:"  // Height 2: Multiply loop: Main loop head
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "sub x27, x27, #0x10\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x6f81e22e  // udot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x6f81e20f  // udot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e22c  // udot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e20d  // udot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e22e  // udot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e20f  // udot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea2c  // udot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea0d  // udot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea2e  // udot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea0f  // udot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea2c  // udot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea0d  // udot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea2e  // udot v14.4s, v17.16b, v1.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa1ea0f  // udot v15.4s, v16.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 51b\n"
       "52:"  // Height 2: Multiply loop: Single iteration only
       ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
       ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x20]\n"
       "add x26, x26, #0x10\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q16, [x10, #0x30]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x6f81e22e  // udot v14.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x40]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x6f81e20f  // udot v15.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x10, #0x50]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
+      ".inst 0x6fa0e228  // udot v8.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e22c  // udot v12.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6fa0e209  // udot v9.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e20d  // udot v13.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x70]\n"
+      ".inst 0x6fa0e22a  // udot v10.4s, v17.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e22e  // udot v14.4s, v17.16b, v1.4b[1]\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x6fa0e20b  // udot v11.4s, v16.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e20f  // udot v15.4s, v16.16b, v1.4b[1]\n"
+      "ldr q16, [x10, #0x90]\n"
+      ".inst 0x6f80ea28  // udot v8.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea2c  // udot v12.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xa0]\n"
+      ".inst 0x6f80ea09  // udot v9.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea0d  // udot v13.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xb0]\n"
+      ".inst 0x6f80ea2a  // udot v10.4s, v17.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea2e  // udot v14.4s, v17.16b, v1.4b[2]\n"
+      "ldr q17, [x10, #0xc0]\n"
+      ".inst 0x6f80ea0b  // udot v11.4s, v16.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea0f  // udot v15.4s, v16.16b, v1.4b[2]\n"
+      "ldr q16, [x10, #0xd0]\n"
+      ".inst 0x6fa0ea28  // udot v8.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea2c  // udot v12.4s, v17.16b, v1.4b[3]\n"
+      "ldr q17, [x10, #0xe0]\n"
+      ".inst 0x6fa0ea09  // udot v9.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea0d  // udot v13.4s, v16.16b, v1.4b[3]\n"
+      "ldr q16, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa0ea2a  // udot v10.4s, v17.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea2e  // udot v14.4s, v17.16b, v1.4b[3]\n"
+      ".inst 0x6fa0ea0b  // udot v11.4s, v16.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea0f  // udot v15.4s, v16.16b, v1.4b[3]\n"
       "53:"  // Height 2: Multiply loop: Main loop skip
       "cbz x27, 58f\n"
       "cmp x27, #0x4\n"
       "blt 55f\n"
       "54:"  // Height 2: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s19, [x26], #0x4\n"
+      "ldr s18, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x6f93e228  // udot v8.4s, v17.16b, v19.4b[0]\n"
+      ".inst 0x6f92e22c  // udot v12.4s, v17.16b, v18.4b[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6f93e209  // udot v9.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x6f92e20d  // udot v13.4s, v16.16b, v18.4b[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x6f93e22a  // udot v10.4s, v17.16b, v19.4b[0]\n"
+      ".inst 0x6f92e22e  // udot v14.4s, v17.16b, v18.4b[0]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f93e20b  // udot v11.4s, v16.16b, v19.4b[0]\n"
+      ".inst 0x6f92e20f  // udot v15.4s, v16.16b, v18.4b[0]\n"
       "bge 54b\n"
       "55:"  // Height 2: Multiply loop: Skip odd blocks
       "cbz x27, 58f\n"
@@ -636,19 +635,19 @@
       "ldr b0, [x26, #0x0]\n"
       "ldr b1, [x25, #0x0]\n"
       "57:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q16, [x10, #0x10]\n"
+      ".inst 0x6f80e228  // udot v8.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x6f81e22c  // udot v12.4s, v17.16b, v1.4b[0]\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6f80e209  // udot v9.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x6f81e20d  // udot v13.4s, v16.16b, v1.4b[0]\n"
+      "ldr q16, [x10, #0x30]\n"
+      ".inst 0x6f80e22a  // udot v10.4s, v17.16b, v0.4b[0]\n"
+      ".inst 0x6f81e22e  // udot v14.4s, v17.16b, v1.4b[0]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f80e20b  // udot v11.4s, v16.16b, v0.4b[0]\n"
+      ".inst 0x6f81e20f  // udot v15.4s, v16.16b, v1.4b[0]\n"
       "58:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -866,13 +865,13 @@
       "82:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 83f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 84f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -881,8 +880,8 @@
       "b 84f\n"
       "83:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "84:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "blt 87f\n"
@@ -899,75 +898,75 @@
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e2aa  // udot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f81e2ae  // udot v14.4s, v21.16b, v1.4b[0]\n"
       "cmp x27, #0x20\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f82e2b2  // udot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      ".inst 0x6f80e28b  // udot v11.4s, v20.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
+      ".inst 0x6f81e28f  // udot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e293  // udot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x10, #0x50]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6fa0e2a8  // udot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ac  // udot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e2b0  // udot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      ".inst 0x6fa0e289  // udot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e28d  // udot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e291  // udot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      ".inst 0x6fa0e2aa  // udot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ae  // udot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e2b2  // udot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      ".inst 0x6fa0e28b  // udot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e28f  // udot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e293  // udot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      ".inst 0x6f80eaa8  // udot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaac  // udot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x6f82eab0  // udot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      ".inst 0x6f80ea89  // udot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea8d  // udot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x6f82ea91  // udot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      ".inst 0x6f80eaaa  // udot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaae  // udot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x6f82eab2  // udot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      ".inst 0x6f80ea8b  // udot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea8f  // udot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x6f82ea93  // udot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      ".inst 0x6fa0eaa8  // udot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eaac  // udot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eab0  // udot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      ".inst 0x6fa0ea89  // udot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea8d  // udot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ea91  // udot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eaaa  // udot v10.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eaae  // udot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eab2  // udot v18.4s, v21.16b, v2.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0ea8b  // udot v11.4s, v20.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa1ea8f  // udot v15.4s, v20.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa2ea93  // udot v19.4s, v20.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 85b\n"
@@ -977,98 +976,98 @@
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q21, [x10, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q20, [x10, #0x30]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e2aa  // udot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f81e2ae  // udot v14.4s, v21.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f82e2b2  // udot v18.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x10, #0x40]\n"
+      ".inst 0x6f80e28b  // udot v11.4s, v20.16b, v0.4b[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6f81e28f  // udot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e293  // udot v19.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x10, #0x50]\n"
+      ".inst 0x6fa0e2a8  // udot v8.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ac  // udot v12.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e2b0  // udot v16.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x60]\n"
+      ".inst 0x6fa0e289  // udot v9.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e28d  // udot v13.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e291  // udot v17.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x70]\n"
+      ".inst 0x6fa0e2aa  // udot v10.4s, v21.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e2ae  // udot v14.4s, v21.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e2b2  // udot v18.4s, v21.16b, v2.4b[1]\n"
+      "ldr q21, [x10, #0x80]\n"
+      ".inst 0x6fa0e28b  // udot v11.4s, v20.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e28f  // udot v15.4s, v20.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e293  // udot v19.4s, v20.16b, v2.4b[1]\n"
+      "ldr q20, [x10, #0x90]\n"
+      ".inst 0x6f80eaa8  // udot v8.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaac  // udot v12.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x6f82eab0  // udot v16.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xa0]\n"
+      ".inst 0x6f80ea89  // udot v9.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea8d  // udot v13.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x6f82ea91  // udot v17.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xb0]\n"
+      ".inst 0x6f80eaaa  // udot v10.4s, v21.16b, v0.4b[2]\n"
+      ".inst 0x6f81eaae  // udot v14.4s, v21.16b, v1.4b[2]\n"
+      ".inst 0x6f82eab2  // udot v18.4s, v21.16b, v2.4b[2]\n"
+      "ldr q21, [x10, #0xc0]\n"
+      ".inst 0x6f80ea8b  // udot v11.4s, v20.16b, v0.4b[2]\n"
+      ".inst 0x6f81ea8f  // udot v15.4s, v20.16b, v1.4b[2]\n"
+      ".inst 0x6f82ea93  // udot v19.4s, v20.16b, v2.4b[2]\n"
+      "ldr q20, [x10, #0xd0]\n"
+      ".inst 0x6fa0eaa8  // udot v8.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eaac  // udot v12.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eab0  // udot v16.4s, v21.16b, v2.4b[3]\n"
+      "ldr q21, [x10, #0xe0]\n"
+      ".inst 0x6fa0ea89  // udot v9.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea8d  // udot v13.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ea91  // udot v17.4s, v20.16b, v2.4b[3]\n"
+      "ldr q20, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa0eaaa  // udot v10.4s, v21.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eaae  // udot v14.4s, v21.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eab2  // udot v18.4s, v21.16b, v2.4b[3]\n"
+      ".inst 0x6fa0ea8b  // udot v11.4s, v20.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ea8f  // udot v15.4s, v20.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ea93  // udot v19.4s, v20.16b, v2.4b[3]\n"
       "87:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 92f\n"
       "cmp x27, #0x4\n"
       "blt 89f\n"
       "88:"  // Height 3: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr q21, [x10, #0x0]\n"
+      ".inst 0x6f98e2a8  // udot v8.4s, v21.16b, v24.4b[0]\n"
+      ".inst 0x6f97e2ac  // udot v12.4s, v21.16b, v23.4b[0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      ".inst 0x6f96e2b0  // udot v16.4s, v21.16b, v22.4b[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      ".inst 0x6f98e289  // udot v9.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x6f97e28d  // udot v13.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x6f96e291  // udot v17.4s, v20.16b, v22.4b[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f98e2aa  // udot v10.4s, v21.16b, v24.4b[0]\n"
+      ".inst 0x6f97e2ae  // udot v14.4s, v21.16b, v23.4b[0]\n"
+      ".inst 0x6f96e2b2  // udot v18.4s, v21.16b, v22.4b[0]\n"
+      ".inst 0x6f98e28b  // udot v11.4s, v20.16b, v24.4b[0]\n"
+      ".inst 0x6f97e28f  // udot v15.4s, v20.16b, v23.4b[0]\n"
+      ".inst 0x6f96e293  // udot v19.4s, v20.16b, v22.4b[0]\n"
       "bge 88b\n"
       "89:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 92f\n"
@@ -1086,23 +1085,23 @@
       "ldr b1, [x25, #0x0]\n"
       "ldr b2, [x24, #0x0]\n"
       "91:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q21, [x10, #0x0]\n"
+      "ldr q20, [x10, #0x10]\n"
+      ".inst 0x6f80e2a8  // udot v8.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f81e2ac  // udot v12.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x6f82e2b0  // udot v16.4s, v21.16b, v2.4b[0]\n"
+      "ldr q21, [x10, #0x20]\n"
+      ".inst 0x6f80e289  // udot v9.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x6f81e28d  // udot v13.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e291  // udot v17.4s, v20.16b, v2.4b[0]\n"
+      "ldr q20, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f80e2aa  // udot v10.4s, v21.16b, v0.4b[0]\n"
+      ".inst 0x6f81e2ae  // udot v14.4s, v21.16b, v1.4b[0]\n"
+      ".inst 0x6f82e2b2  // udot v18.4s, v21.16b, v2.4b[0]\n"
+      ".inst 0x6f80e28b  // udot v11.4s, v20.16b, v0.4b[0]\n"
+      ".inst 0x6f81e28f  // udot v15.4s, v20.16b, v1.4b[0]\n"
+      ".inst 0x6f82e293  // udot v19.4s, v20.16b, v2.4b[0]\n"
       "92:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1367,14 +1366,14 @@
       "116:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 117f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 118f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1384,9 +1383,9 @@
       "b 118f\n"
       "117:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "118:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "blt 121f\n"
@@ -1405,7 +1404,7 @@
       "add x26, x26, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x25, x25, #0x10\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1413,85 +1412,85 @@
       "add x23, x23, #0x10\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e32a  // udot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e32e  // udot v14.4s, v25.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x6f82e332  // udot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x6f83e336  // udot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f80e30b  // udot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30f  // udot v15.4s, v24.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6f82e313  // udot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e317  // udot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      ".inst 0x6fa0e328  // udot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32c  // udot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e330  // udot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e334  // udot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      ".inst 0x6fa0e309  // udot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e30d  // udot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e311  // udot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e315  // udot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      ".inst 0x6fa0e32a  // udot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32e  // udot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e332  // udot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e336  // udot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      ".inst 0x6fa0e30b  // udot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e30f  // udot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e313  // udot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e317  // udot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      ".inst 0x6f80eb28  // udot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2c  // udot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb30  // udot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb34  // udot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      ".inst 0x6f80eb09  // udot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb0d  // udot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb11  // udot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb15  // udot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      ".inst 0x6f80eb2a  // udot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2e  // udot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb32  // udot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb36  // udot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      ".inst 0x6f80eb0b  // udot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb0f  // udot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb13  // udot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb17  // udot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      ".inst 0x6fa0eb28  // udot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb2c  // udot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb30  // udot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb34  // udot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      ".inst 0x6fa0eb09  // udot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb0d  // udot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb11  // udot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb15  // udot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa0eb2a  // udot v10.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb2e  // udot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb32  // udot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb36  // udot v22.4s, v25.16b, v3.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eb0b  // udot v11.4s, v24.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa1eb0f  // udot v15.4s, v24.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa2eb13  // udot v19.4s, v24.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa3eb17  // udot v23.4s, v24.16b, v3.4b[3]\n"
       "ldr q3, [x23, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 119b\n"
@@ -1502,7 +1501,7 @@
       "add x25, x25, #0x10\n"
       ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
       ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q25, [x10, #0x20]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -1510,112 +1509,112 @@
       "sub x27, x27, #0x10\n"
       ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q24, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e32a  // udot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e32e  // udot v14.4s, v25.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
+      ".inst 0x6f82e332  // udot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x6f83e336  // udot v22.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x10, #0x40]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6f80e30b  // udot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30f  // udot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x6f82e313  // udot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e317  // udot v23.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x10, #0x50]\n"
+      ".inst 0x6fa0e328  // udot v8.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32c  // udot v12.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e330  // udot v16.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e334  // udot v20.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x60]\n"
+      ".inst 0x6fa0e309  // udot v9.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e30d  // udot v13.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e311  // udot v17.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e315  // udot v21.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x70]\n"
+      ".inst 0x6fa0e32a  // udot v10.4s, v25.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e32e  // udot v14.4s, v25.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e332  // udot v18.4s, v25.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e336  // udot v22.4s, v25.16b, v3.4b[1]\n"
+      "ldr q25, [x10, #0x80]\n"
+      ".inst 0x6fa0e30b  // udot v11.4s, v24.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e30f  // udot v15.4s, v24.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e313  // udot v19.4s, v24.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e317  // udot v23.4s, v24.16b, v3.4b[1]\n"
+      "ldr q24, [x10, #0x90]\n"
+      ".inst 0x6f80eb28  // udot v8.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2c  // udot v12.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb30  // udot v16.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb34  // udot v20.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xa0]\n"
+      ".inst 0x6f80eb09  // udot v9.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb0d  // udot v13.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb11  // udot v17.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb15  // udot v21.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xb0]\n"
+      ".inst 0x6f80eb2a  // udot v10.4s, v25.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb2e  // udot v14.4s, v25.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb32  // udot v18.4s, v25.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb36  // udot v22.4s, v25.16b, v3.4b[2]\n"
+      "ldr q25, [x10, #0xc0]\n"
+      ".inst 0x6f80eb0b  // udot v11.4s, v24.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb0f  // udot v15.4s, v24.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb13  // udot v19.4s, v24.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb17  // udot v23.4s, v24.16b, v3.4b[2]\n"
+      "ldr q24, [x10, #0xd0]\n"
+      ".inst 0x6fa0eb28  // udot v8.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb2c  // udot v12.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb30  // udot v16.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb34  // udot v20.4s, v25.16b, v3.4b[3]\n"
+      "ldr q25, [x10, #0xe0]\n"
+      ".inst 0x6fa0eb09  // udot v9.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb0d  // udot v13.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb11  // udot v17.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb15  // udot v21.4s, v24.16b, v3.4b[3]\n"
+      "ldr q24, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa0eb2a  // udot v10.4s, v25.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb2e  // udot v14.4s, v25.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb32  // udot v18.4s, v25.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb36  // udot v22.4s, v25.16b, v3.4b[3]\n"
+      ".inst 0x6fa0eb0b  // udot v11.4s, v24.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb0f  // udot v15.4s, v24.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb13  // udot v19.4s, v24.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb17  // udot v23.4s, v24.16b, v3.4b[3]\n"
       "121:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 126f\n"
       "cmp x27, #0x4\n"
       "blt 123f\n"
       "122:"  // Height 4: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      ".inst 0x6f9de328  // udot v8.4s, v25.16b, v29.4b[0]\n"
+      ".inst 0x6f9ce32c  // udot v12.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x6f9be330  // udot v16.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae334  // udot v20.4s, v25.16b, v26.4b[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      ".inst 0x6f9de309  // udot v9.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x6f9ce30d  // udot v13.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x6f9be311  // udot v17.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae315  // udot v21.4s, v24.16b, v26.4b[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f9de32a  // udot v10.4s, v25.16b, v29.4b[0]\n"
+      ".inst 0x6f9ce32e  // udot v14.4s, v25.16b, v28.4b[0]\n"
+      ".inst 0x6f9be332  // udot v18.4s, v25.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae336  // udot v22.4s, v25.16b, v26.4b[0]\n"
+      ".inst 0x6f9de30b  // udot v11.4s, v24.16b, v29.4b[0]\n"
+      ".inst 0x6f9ce30f  // udot v15.4s, v24.16b, v28.4b[0]\n"
+      ".inst 0x6f9be313  // udot v19.4s, v24.16b, v27.4b[0]\n"
+      ".inst 0x6f9ae317  // udot v23.4s, v24.16b, v26.4b[0]\n"
       "bge 122b\n"
       "123:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 126f\n"
@@ -1636,27 +1635,27 @@
       "ldr b2, [x24, #0x0]\n"
       "ldr b3, [x23, #0x0]\n"
       "125:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q25, [x10, #0x0]\n"
+      "ldr q24, [x10, #0x10]\n"
+      ".inst 0x6f80e328  // udot v8.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e32c  // udot v12.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f82e330  // udot v16.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x6f83e334  // udot v20.4s, v25.16b, v3.4b[0]\n"
+      "ldr q25, [x10, #0x20]\n"
+      ".inst 0x6f80e309  // udot v9.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30d  // udot v13.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x6f82e311  // udot v17.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e315  // udot v21.4s, v24.16b, v3.4b[0]\n"
+      "ldr q24, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f80e32a  // udot v10.4s, v25.16b, v0.4b[0]\n"
+      ".inst 0x6f81e32e  // udot v14.4s, v25.16b, v1.4b[0]\n"
+      ".inst 0x6f82e332  // udot v18.4s, v25.16b, v2.4b[0]\n"
+      ".inst 0x6f83e336  // udot v22.4s, v25.16b, v3.4b[0]\n"
+      ".inst 0x6f80e30b  // udot v11.4s, v24.16b, v0.4b[0]\n"
+      ".inst 0x6f81e30f  // udot v15.4s, v24.16b, v1.4b[0]\n"
+      ".inst 0x6f82e313  // udot v19.4s, v24.16b, v2.4b[0]\n"
+      ".inst 0x6f83e317  // udot v23.4s, v24.16b, v3.4b[0]\n"
       "126:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1968,15 +1967,15 @@
       "150:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 151f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 152f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1987,10 +1986,10 @@
       "b 152f\n"
       "151:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "152:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "blt 155f\n"
@@ -2013,7 +2012,7 @@
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2022,100 +2021,100 @@
       "cmp x27, #0x20\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
       ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e3aa  // udot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f82e3b2  // udot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f83e3b6  // udot v22.4s, v29.16b, v3.4b[0]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6f84e3ba  // udot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      ".inst 0x6f80e38b  // udot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e393  // udot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e397  // udot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e39b  // udot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      ".inst 0x6fa0e3a8  // udot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ac  // udot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3b0  // udot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e3b4  // udot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e3b8  // udot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      ".inst 0x6fa0e389  // udot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e38d  // udot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e391  // udot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e395  // udot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e399  // udot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      ".inst 0x6fa0e3aa  // udot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ae  // udot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3b2  // udot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e3b6  // udot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e3ba  // udot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      ".inst 0x6fa0e38b  // udot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e38f  // udot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e393  // udot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e397  // udot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e39b  // udot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      ".inst 0x6f80eba8  // udot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebac  // udot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f82ebb0  // udot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb4  // udot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebb8  // udot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      ".inst 0x6f80eb89  // udot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb8d  // udot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb91  // udot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb95  // udot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb99  // udot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      ".inst 0x6f80ebaa  // udot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebae  // udot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f82ebb2  // udot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb6  // udot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebba  // udot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      ".inst 0x6f80eb8b  // udot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb8f  // udot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb93  // udot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb97  // udot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb9b  // udot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      ".inst 0x6fa0eba8  // udot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebac  // udot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebb0  // udot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb4  // udot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebb8  // udot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      ".inst 0x6fa0eb89  // udot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb8d  // udot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb91  // udot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb95  // udot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x6fa4eb99  // udot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa0ebaa  // udot v10.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebae  // udot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebb2  // udot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb6  // udot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebba  // udot v26.4s, v29.16b, v4.4b[3]\n"
       "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa0eb8b  // udot v11.4s, v28.16b, v0.4b[3]\n"
       "ldr q0, [x26, #0x0]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa1eb8f  // udot v15.4s, v28.16b, v1.4b[3]\n"
       "ldr q1, [x25, #0x0]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa2eb93  // udot v19.4s, v28.16b, v2.4b[3]\n"
       "ldr q2, [x24, #0x0]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa3eb97  // udot v23.4s, v28.16b, v3.4b[3]\n"
       "ldr q3, [x23, #0x0]\n"
-      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa4eb9b  // udot v27.4s, v28.16b, v4.4b[3]\n"
       "ldr q4, [x22, #0x0]\n"
       "ldr q7, [x10, #0x10]\n"
       "bge 153b\n"
@@ -2129,7 +2128,7 @@
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
+      "ldr q29, [x10, #0x20]\n"
       ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
       "add x22, x22, #0x10\n"
       ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
@@ -2138,131 +2137,131 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
       ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q28, [x10, #0x30]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e3aa  // udot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f82e3b2  // udot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f83e3b6  // udot v22.4s, v29.16b, v3.4b[0]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
-      "ldr q6, [x10, #0x80]\n"
-      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
-      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
-      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
-      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
-      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
-      "ldr q7, [x10, #0x90]\n"
-      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xa0]\n"
-      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xb0]\n"
-      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
-      "ldr q6, [x10, #0xc0]\n"
-      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
-      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
-      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
-      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
-      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
-      "ldr q7, [x10, #0xd0]\n"
-      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
-      "ldr q6, [x10, #0xe0]\n"
-      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
-      "ldr q7, [x10, #0xf0]\n"
+      ".inst 0x6f84e3ba  // udot v26.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x10, #0x40]\n"
+      ".inst 0x6f80e38b  // udot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e393  // udot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e397  // udot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e39b  // udot v27.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x10, #0x50]\n"
+      ".inst 0x6fa0e3a8  // udot v8.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ac  // udot v12.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3b0  // udot v16.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e3b4  // udot v20.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e3b8  // udot v24.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x60]\n"
+      ".inst 0x6fa0e389  // udot v9.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e38d  // udot v13.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e391  // udot v17.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e395  // udot v21.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e399  // udot v25.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x70]\n"
+      ".inst 0x6fa0e3aa  // udot v10.4s, v29.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e3ae  // udot v14.4s, v29.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e3b2  // udot v18.4s, v29.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e3b6  // udot v22.4s, v29.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e3ba  // udot v26.4s, v29.16b, v4.4b[1]\n"
+      "ldr q29, [x10, #0x80]\n"
+      ".inst 0x6fa0e38b  // udot v11.4s, v28.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e38f  // udot v15.4s, v28.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e393  // udot v19.4s, v28.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e397  // udot v23.4s, v28.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e39b  // udot v27.4s, v28.16b, v4.4b[1]\n"
+      "ldr q28, [x10, #0x90]\n"
+      ".inst 0x6f80eba8  // udot v8.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebac  // udot v12.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f82ebb0  // udot v16.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb4  // udot v20.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebb8  // udot v24.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xa0]\n"
+      ".inst 0x6f80eb89  // udot v9.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb8d  // udot v13.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb91  // udot v17.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb95  // udot v21.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb99  // udot v25.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xb0]\n"
+      ".inst 0x6f80ebaa  // udot v10.4s, v29.16b, v0.4b[2]\n"
+      ".inst 0x6f81ebae  // udot v14.4s, v29.16b, v1.4b[2]\n"
+      ".inst 0x6f82ebb2  // udot v18.4s, v29.16b, v2.4b[2]\n"
+      ".inst 0x6f83ebb6  // udot v22.4s, v29.16b, v3.4b[2]\n"
+      ".inst 0x6f84ebba  // udot v26.4s, v29.16b, v4.4b[2]\n"
+      "ldr q29, [x10, #0xc0]\n"
+      ".inst 0x6f80eb8b  // udot v11.4s, v28.16b, v0.4b[2]\n"
+      ".inst 0x6f81eb8f  // udot v15.4s, v28.16b, v1.4b[2]\n"
+      ".inst 0x6f82eb93  // udot v19.4s, v28.16b, v2.4b[2]\n"
+      ".inst 0x6f83eb97  // udot v23.4s, v28.16b, v3.4b[2]\n"
+      ".inst 0x6f84eb9b  // udot v27.4s, v28.16b, v4.4b[2]\n"
+      "ldr q28, [x10, #0xd0]\n"
+      ".inst 0x6fa0eba8  // udot v8.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebac  // udot v12.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebb0  // udot v16.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb4  // udot v20.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebb8  // udot v24.4s, v29.16b, v4.4b[3]\n"
+      "ldr q29, [x10, #0xe0]\n"
+      ".inst 0x6fa0eb89  // udot v9.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb8d  // udot v13.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb91  // udot v17.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb95  // udot v21.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x6fa4eb99  // udot v25.4s, v28.16b, v4.4b[3]\n"
+      "ldr q28, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
-      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
-      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
-      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
-      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
-      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa0ebaa  // udot v10.4s, v29.16b, v0.4b[3]\n"
+      ".inst 0x6fa1ebae  // udot v14.4s, v29.16b, v1.4b[3]\n"
+      ".inst 0x6fa2ebb2  // udot v18.4s, v29.16b, v2.4b[3]\n"
+      ".inst 0x6fa3ebb6  // udot v22.4s, v29.16b, v3.4b[3]\n"
+      ".inst 0x6fa4ebba  // udot v26.4s, v29.16b, v4.4b[3]\n"
+      ".inst 0x6fa0eb8b  // udot v11.4s, v28.16b, v0.4b[3]\n"
+      ".inst 0x6fa1eb8f  // udot v15.4s, v28.16b, v1.4b[3]\n"
+      ".inst 0x6fa2eb93  // udot v19.4s, v28.16b, v2.4b[3]\n"
+      ".inst 0x6fa3eb97  // udot v23.4s, v28.16b, v3.4b[3]\n"
+      ".inst 0x6fa4eb9b  // udot v27.4s, v28.16b, v4.4b[3]\n"
       "155:"  // Height 5: Multiply loop: Main loop skip
       "cbz x27, 160f\n"
       "cmp x27, #0x4\n"
       "blt 157f\n"
       "156:"  // Height 5: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
       "ldr s1, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s0, [x24], #0x4\n"
+      "ldr s31, [x23], #0x4\n"
+      "ldr s30, [x22], #0x4\n"
+      "ldr q29, [x10, #0x0]\n"
+      ".inst 0x6f82e3a8  // udot v8.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f81e3ac  // udot v12.4s, v29.16b, v1.4b[0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      ".inst 0x6f80e3b0  // udot v16.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe3b4  // udot v20.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee3b8  // udot v24.4s, v29.16b, v30.4b[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      ".inst 0x6f82e389  // udot v9.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f81e38d  // udot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f80e391  // udot v17.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe395  // udot v21.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee399  // udot v25.4s, v28.16b, v30.4b[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f82e3aa  // udot v10.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f80e3b2  // udot v18.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe3b6  // udot v22.4s, v29.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee3ba  // udot v26.4s, v29.16b, v30.4b[0]\n"
+      ".inst 0x6f82e38b  // udot v11.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f80e393  // udot v19.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f9fe397  // udot v23.4s, v28.16b, v31.4b[0]\n"
+      ".inst 0x6f9ee39b  // udot v27.4s, v28.16b, v30.4b[0]\n"
       "bge 156b\n"
       "157:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 160f\n"
@@ -2286,31 +2285,31 @@
       "ldr b3, [x23, #0x0]\n"
       "ldr b4, [x22, #0x0]\n"
       "159:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q29, [x10, #0x0]\n"
+      "ldr q28, [x10, #0x10]\n"
+      ".inst 0x6f80e3a8  // udot v8.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3ac  // udot v12.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3b0  // udot v16.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f83e3b4  // udot v20.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x6f84e3b8  // udot v24.4s, v29.16b, v4.4b[0]\n"
+      "ldr q29, [x10, #0x20]\n"
+      ".inst 0x6f80e389  // udot v9.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38d  // udot v13.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e391  // udot v17.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e395  // udot v21.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e399  // udot v25.4s, v28.16b, v4.4b[0]\n"
+      "ldr q28, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f80e3aa  // udot v10.4s, v29.16b, v0.4b[0]\n"
+      ".inst 0x6f81e3ae  // udot v14.4s, v29.16b, v1.4b[0]\n"
+      ".inst 0x6f82e3b2  // udot v18.4s, v29.16b, v2.4b[0]\n"
+      ".inst 0x6f83e3b6  // udot v22.4s, v29.16b, v3.4b[0]\n"
+      ".inst 0x6f84e3ba  // udot v26.4s, v29.16b, v4.4b[0]\n"
+      ".inst 0x6f80e38b  // udot v11.4s, v28.16b, v0.4b[0]\n"
+      ".inst 0x6f81e38f  // udot v15.4s, v28.16b, v1.4b[0]\n"
+      ".inst 0x6f82e393  // udot v19.4s, v28.16b, v2.4b[0]\n"
+      ".inst 0x6f83e397  // udot v23.4s, v28.16b, v3.4b[0]\n"
+      ".inst 0x6f84e39b  // udot v27.4s, v28.16b, v4.4b[0]\n"
       "160:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2672,16 +2671,16 @@
       "184:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 185f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 186f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2693,11 +2692,11 @@
       "b 186f\n"
       "185:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "186:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "blt 189f\n"
@@ -2976,43 +2975,43 @@
       "cmp x27, #0x4\n"
       "blt 191f\n"
       "190:"  // Height 6: Multiply loop: Odd block loop
-      "ldr s0, [x26], #0x4\n"
-      "ldr s1, [x25], #0x4\n"
+      "ldr s7, [x26], #0x4\n"
+      "ldr s6, [x25], #0x4\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "ldr s2, [x24], #0x4\n"
-      "ldr s3, [x23], #0x4\n"
-      "ldr s4, [x22], #0x4\n"
-      "ldr s5, [x21], #0x4\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s4, [x23], #0x4\n"
+      "ldr s3, [x22], #0x4\n"
+      "ldr s2, [x21], #0x4\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6f87e028  // udot v8.4s, v1.16b, v7.4b[0]\n"
+      ".inst 0x6f86e02c  // udot v12.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x6f85e030  // udot v16.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x6f84e034  // udot v20.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x6f83e038  // udot v24.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x6f82e03c  // udot v28.4s, v1.16b, v2.4b[0]\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x6f87e009  // udot v9.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x6f86e00d  // udot v13.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x6f85e011  // udot v17.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x6f84e015  // udot v21.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x6f83e019  // udot v25.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x6f82e01d  // udot v29.4s, v0.16b, v2.4b[0]\n"
+      "ldr q0, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x6f87e02a  // udot v10.4s, v1.16b, v7.4b[0]\n"
+      ".inst 0x6f86e02e  // udot v14.4s, v1.16b, v6.4b[0]\n"
+      ".inst 0x6f85e032  // udot v18.4s, v1.16b, v5.4b[0]\n"
+      ".inst 0x6f84e036  // udot v22.4s, v1.16b, v4.4b[0]\n"
+      ".inst 0x6f83e03a  // udot v26.4s, v1.16b, v3.4b[0]\n"
+      ".inst 0x6f82e03e  // udot v30.4s, v1.16b, v2.4b[0]\n"
+      ".inst 0x6f87e00b  // udot v11.4s, v0.16b, v7.4b[0]\n"
+      ".inst 0x6f86e00f  // udot v15.4s, v0.16b, v6.4b[0]\n"
+      ".inst 0x6f85e013  // udot v19.4s, v0.16b, v5.4b[0]\n"
+      ".inst 0x6f84e017  // udot v23.4s, v0.16b, v4.4b[0]\n"
+      ".inst 0x6f83e01b  // udot v27.4s, v0.16b, v3.4b[0]\n"
+      ".inst 0x6f82e01f  // udot v31.4s, v0.16b, v2.4b[0]\n"
       "bge 190b\n"
       "191:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 194f\n"
@@ -3039,35 +3038,35 @@
       "ldr b4, [x22, #0x0]\n"
       "ldr b5, [x21, #0x0]\n"
       "193:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
-      "ldr q7, [x10, #0x30]\n"
+      "ldr q7, [x10, #0x0]\n"
+      "ldr q6, [x10, #0x10]\n"
+      ".inst 0x6f80e0e8  // udot v8.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ec  // udot v12.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f0  // udot v16.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f4  // udot v20.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f8  // udot v24.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fc  // udot v28.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x10, #0x20]\n"
+      ".inst 0x6f80e0c9  // udot v9.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0cd  // udot v13.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d1  // udot v17.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d5  // udot v21.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0d9  // udot v25.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0dd  // udot v29.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x10, #0x30]\n"
       "add x10, x10, #0x40\n"
-      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
-      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
-      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
-      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
-      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
-      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
-      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x6f80e0ea  // udot v10.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ee  // udot v14.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f2  // udot v18.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f6  // udot v22.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fa  // udot v26.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fe  // udot v30.4s, v7.16b, v5.4b[0]\n"
+      ".inst 0x6f80e0cb  // udot v11.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0cf  // udot v15.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d3  // udot v19.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d7  // udot v23.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0db  // udot v27.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0df  // udot v31.4s, v6.16b, v5.4b[0]\n"
       "194:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3254,7 +3253,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "206:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index b5cedc7..e360452 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -109,5 +109,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
index dd0c46e..364f388 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
@@ -77,7 +77,6 @@
     ka.N = N;
     ka.B_ptr = B_ptr;
     __asm__ __volatile__(
-
       "1:"  // Row loop
       "cmp %x[M], #0x6\n"
       "bge 186f\n"
@@ -178,11 +177,11 @@
       "15:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -198,41 +197,41 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 19f\n"
       "18:"  // Height 1: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "trn1 v19.2d, v1.2d, v20.2d\n"
+      ".inst 0x6e87a668  // ummla v8.4s, v19.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e86a66c  // ummla v12.4s, v19.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e92a669  // ummla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e91a66d  // ummla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a66a  // ummla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v20.2d\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x6e92a428  // ummla v8.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e91a42c  // ummla v12.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e92a429  // ummla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e91a42d  // ummla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e92a42a  // ummla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e91a42e  // ummla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "cmp x27, #0x20\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e92a42b  // ummla v11.4s, v1.16b, v18.16b\n"
+      ".inst 0x6e91a42f  // ummla v15.4s, v1.16b, v17.16b\n"
       "ldr q1, [x26, #0x0]\n"
       "add x10, x10, #0x100\n"
       "ldr q7, [x10, #0x0]\n"
@@ -240,40 +239,40 @@
       "prfm pldl1keep, [x26, #0x80]\n"
       "bge 18b\n"
       "19:"  // Height 1: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "trn1 v20.2d, v1.2d, v21.2d\n"
+      ".inst 0x6e87a688  // ummla v8.4s, v20.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e86a68c  // ummla v12.4s, v20.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e92a689  // ummla v9.4s, v20.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e91a68d  // ummla v13.4s, v20.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a68a  // ummla v10.4s, v20.16b, v18.16b\n"
+      "ldr q19, [x10, #0x60]\n"
+      ".inst 0x6e91a68e  // ummla v14.4s, v20.16b, v17.16b\n"
+      "ldr q18, [x10, #0x70]\n"
+      "trn2 v1.2d, v1.2d, v21.2d\n"
+      ".inst 0x6e93a68b  // ummla v11.4s, v20.16b, v19.16b\n"
+      "ldr q17, [x10, #0x80]\n"
+      ".inst 0x6e92a68f  // ummla v15.4s, v20.16b, v18.16b\n"
+      "ldr q19, [x10, #0x90]\n"
+      ".inst 0x6e91a428  // ummla v8.4s, v1.16b, v17.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e93a42c  // ummla v12.4s, v1.16b, v19.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e92a429  // ummla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e91a42d  // ummla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e92a42a  // ummla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e91a42e  // ummla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e92a42b  // ummla v11.4s, v1.16b, v18.16b\n"
+      ".inst 0x6e91a42f  // ummla v15.4s, v1.16b, v17.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "add x10, x10, #0x100\n"
       "20:"  // Height 1: Multiply loop: Main loop skip
@@ -281,26 +280,26 @@
       "cmp x27, #0x8\n"
       "blt 22f\n"
       "21:"  // Height 1: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e87a40e  // ummla v14.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr d19, [x26], #0x8\n"
+      "ldr q18, [x10, #0x0]\n"
+      "trn1 v19.2d, v19.2d, v17.2d\n"
+      "ldr q17, [x10, #0x10]\n"
+      ".inst 0x6e92a668  // ummla v8.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e91a66c  // ummla v12.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e92a669  // ummla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e91a66d  // ummla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a66a  // ummla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6e86a40b  // ummla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40f  // ummla v15.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "bge 21b\n"
       "22:"  // Height 1: Multiply loop: Skip odd blocks
@@ -325,23 +324,23 @@
       "25:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
       "ldr b1, [x26, #0x0]\n"
       "26:"  // Height 1: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
+      "ldr q23, [x10, #0x0]\n"
+      "ldr q18, [x10, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v17.2d\n"
+      ".inst 0x6e97a668  // ummla v8.4s, v19.16b, v23.16b\n"
+      "ldr q17, [x10, #0x20]\n"
+      ".inst 0x6e92a66c  // ummla v12.4s, v19.16b, v18.16b\n"
+      "ldr q31, [x10, #0x30]\n"
+      ".inst 0x6e91a669  // ummla v9.4s, v19.16b, v17.16b\n"
+      "ldr q20, [x10, #0x40]\n"
+      ".inst 0x6e9fa66d  // ummla v13.4s, v19.16b, v31.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e94a66a  // ummla v10.4s, v19.16b, v20.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "27:"  // Height 1: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -525,12 +524,12 @@
       "52:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 53f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 54f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -538,7 +537,7 @@
       "b 54f\n"
       "53:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "54:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "blt 57f\n"
@@ -549,85 +548,85 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 56f\n"
       "55:"  // Height 2: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e87a668  // ummla v8.4s, v19.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e86a66c  // ummla v12.4s, v19.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e92a669  // ummla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e91a66d  // ummla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a66a  // ummla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x6e92a428  // ummla v8.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e91a42c  // ummla v12.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e92a429  // ummla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e91a42d  // ummla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e92a42a  // ummla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e91a42e  // ummla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "sub x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "ldr q2, [x25, #0x0]\n"
       "cmp x27, #0x20\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e92a42b  // ummla v11.4s, v1.16b, v18.16b\n"
       "add x10, x10, #0x100\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e91a42f  // ummla v15.4s, v1.16b, v17.16b\n"
       "ldr q1, [x26, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "bge 55b\n"
       "56:"  // Height 2: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e87a668  // ummla v8.4s, v19.16b, v7.16b\n"
+      "ldr q18, [x10, #0x20]\n"
+      ".inst 0x6e86a66c  // ummla v12.4s, v19.16b, v6.16b\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e92a669  // ummla v9.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e91a66d  // ummla v13.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a66a  // ummla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x80]\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x90]\n"
+      ".inst 0x6e92a428  // ummla v8.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xa0]\n"
+      ".inst 0x6e91a42c  // ummla v12.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xb0]\n"
+      ".inst 0x6e92a429  // ummla v9.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xc0]\n"
+      ".inst 0x6e91a42d  // ummla v13.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xd0]\n"
+      ".inst 0x6e92a42a  // ummla v10.4s, v1.16b, v18.16b\n"
+      "ldr q18, [x10, #0xe0]\n"
+      ".inst 0x6e91a42e  // ummla v14.4s, v1.16b, v17.16b\n"
+      "ldr q17, [x10, #0xf0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e92a42b  // ummla v11.4s, v1.16b, v18.16b\n"
+      ".inst 0x6e91a42f  // ummla v15.4s, v1.16b, v17.16b\n"
       "sub x27, x27, #0x10\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
@@ -637,27 +636,27 @@
       "cmp x27, #0x8\n"
       "blt 59f\n"
       "58:"  // Height 2: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d18, [x26], #0x8\n"
+      "ldr d17, [x25], #0x8\n"
+      "trn1 v19.2d, v18.2d, v17.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40e  // ummla v14.4s, v0.16b, v7.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q17, [x10, #0x0]\n"
+      "ldr q22, [x10, #0x10]\n"
+      ".inst 0x6e91a668  // ummla v8.4s, v19.16b, v17.16b\n"
+      ".inst 0x6e96a66c  // ummla v12.4s, v19.16b, v22.16b\n"
+      "ldr q1, [x10, #0x20]\n"
+      "ldr q17, [x10, #0x30]\n"
+      ".inst 0x6e81a669  // ummla v9.4s, v19.16b, v1.16b\n"
+      ".inst 0x6e91a66d  // ummla v13.4s, v19.16b, v17.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a66a  // ummla v10.4s, v19.16b, v18.16b\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      "ldr q17, [x10, #0x70]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6e86a40b  // ummla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40f  // ummla v15.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "bge 58b\n"
       "59:"  // Height 2: Multiply loop: Skip odd blocks
@@ -689,23 +688,23 @@
       "ldr b1, [x26, #0x0]\n"
       "ldr b2, [x25, #0x0]\n"
       "63:"  // Height 2: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
+      "ldr q18, [x10, #0x0]\n"
+      "ldr q17, [x10, #0x10]\n"
+      "trn1 v19.2d, v1.2d, v2.2d\n"
+      ".inst 0x6e92a668  // ummla v8.4s, v19.16b, v18.16b\n"
+      "ldr q5, [x10, #0x20]\n"
+      ".inst 0x6e91a66c  // ummla v12.4s, v19.16b, v17.16b\n"
+      "ldr q21, [x10, #0x30]\n"
+      ".inst 0x6e85a669  // ummla v9.4s, v19.16b, v5.16b\n"
+      "ldr q18, [x10, #0x40]\n"
+      ".inst 0x6e95a66d  // ummla v13.4s, v19.16b, v21.16b\n"
+      "ldr q17, [x10, #0x50]\n"
+      ".inst 0x6e92a66a  // ummla v10.4s, v19.16b, v18.16b\n"
+      "ldr q18, [x10, #0x60]\n"
+      ".inst 0x6e91a66e  // ummla v14.4s, v19.16b, v17.16b\n"
+      "ldr q17, [x10, #0x70]\n"
+      ".inst 0x6e92a66b  // ummla v11.4s, v19.16b, v18.16b\n"
+      ".inst 0x6e91a66f  // ummla v15.4s, v19.16b, v17.16b\n"
       "add x10, x10, #0x80\n"
       "64:"  // Height 2: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -953,13 +952,13 @@
       "89:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 90f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 91f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -968,8 +967,8 @@
       "b 91f\n"
       "90:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "91:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "blt 94f\n"
@@ -981,167 +980,167 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 93f\n"
       "92:"  // Height 3: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e87a788  // ummla v8.4s, v28.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e87a770  // ummla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e86a78c  // ummla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x6e86a774  // ummla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
       "cmp x27, #0x20\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa428  // ummla v8.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa470  // ummla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e99a42c  // ummla v12.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e99a474  // ummla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e9aa429  // ummla v9.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa471  // ummla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e99a42d  // ummla v13.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e99a475  // ummla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e9aa42a  // ummla v10.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa472  // ummla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e99a42e  // ummla v14.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a476  // ummla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
+      ".inst 0x6e9aa42b  // ummla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa473  // ummla v19.4s, v3.16b, v26.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e99a42f  // ummla v15.4s, v1.16b, v25.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e99a477  // ummla v23.4s, v3.16b, v25.16b\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "bge 92b\n"
       "93:"  // Height 3: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e87a788  // ummla v8.4s, v28.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e87a770  // ummla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e86a78c  // ummla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x6e86a774  // ummla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
+      "trn2 v3.2d, v3.2d, v29.2d\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
+      ".inst 0x6e9aa428  // ummla v8.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa470  // ummla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e99a42c  // ummla v12.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e99a474  // ummla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e9aa429  // ummla v9.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa471  // ummla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e99a42d  // ummla v13.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a475  // ummla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e9aa42a  // ummla v10.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa472  // ummla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e99a42e  // ummla v14.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a476  // ummla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e9aa42b  // ummla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa473  // ummla v19.4s, v3.16b, v26.16b\n"
+      ".inst 0x6e99a42f  // ummla v15.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a477  // ummla v23.4s, v3.16b, v25.16b\n"
       "94:"  // Height 3: Multiply loop: Main loop skip
       "cbz x27, 101f\n"
       "cmp x27, #0x8\n"
       "blt 96f\n"
       "95:"  // Height 3: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e86a450  // ummla v16.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a454  // ummla v20.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr q26, [x10, #0x0]\n"
+      "trn1 v27.2d, v25.2d, v27.2d\n"
+      ".inst 0x6e9aa788  // ummla v8.4s, v28.16b, v26.16b\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e9aa770  // ummla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e99a78c  // ummla v12.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a774  // ummla v20.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6e86a451  // ummla v17.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6e87a455  // ummla v21.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a452  // ummla v18.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e87a40e  // ummla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a456  // ummla v22.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e86a40b  // ummla v11.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e86a453  // ummla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a40f  // ummla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
       "bge 95b\n"
       "96:"  // Height 3: Multiply loop: Skip odd blocks
       "cbz x27, 101f\n"
@@ -1179,33 +1178,33 @@
       "ldr b2, [x25, #0x0]\n"
       "ldr b3, [x24, #0x0]\n"
       "100:"  // Height 3: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q29, [x10, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v25.2d\n"
+      ".inst 0x6e9aa788  // ummla v8.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa770  // ummla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e9da78c  // ummla v12.4s, v28.16b, v29.16b\n"
+      ".inst 0x6e9da774  // ummla v20.4s, v27.16b, v29.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
       "101:"  // Height 3: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1499,14 +1498,14 @@
       "126:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 127f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 128f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1516,9 +1515,9 @@
       "b 128f\n"
       "127:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "128:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "blt 131f\n"
@@ -1531,173 +1530,173 @@
       "ldr q6, [x10, #0x10]\n"
       "blt 130f\n"
       "129:"  // Height 4: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a788  // ummla v8.4s, v28.16b, v7.16b\n"
       "sub x27, x27, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e87a770  // ummla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e86a78c  // ummla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x6e86a774  // ummla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
       "add x23, x23, #0x10\n"
       "ldr q4, [x23, #0x0]\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa428  // ummla v8.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa470  // ummla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e99a42c  // ummla v12.4s, v1.16b, v25.16b\n"
       "cmp x27, #0x20\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e99a474  // ummla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e9aa429  // ummla v9.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa471  // ummla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e99a42d  // ummla v13.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e99a475  // ummla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e9aa42a  // ummla v10.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa472  // ummla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e99a42e  // ummla v14.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e99a476  // ummla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
+      ".inst 0x6e9aa42b  // ummla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa473  // ummla v19.4s, v3.16b, v26.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e99a42f  // ummla v15.4s, v1.16b, v25.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e99a477  // ummla v23.4s, v3.16b, v25.16b\n"
       "ldr q3, [x24, #0x0]\n"
       "ldr q6, [x10, #0x10]\n"
       "bge 129b\n"
       "130:"  // Height 4: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a788  // ummla v8.4s, v28.16b, v7.16b\n"
       "add x26, x26, #0x10\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e87a770  // ummla v16.4s, v27.16b, v7.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e86a78c  // ummla v12.4s, v28.16b, v6.16b\n"
+      ".inst 0x6e86a774  // ummla v20.4s, v27.16b, v6.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x80]\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
-      ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x90]\n"
+      ".inst 0x6e9aa428  // ummla v8.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e9aa470  // ummla v16.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xa0]\n"
+      ".inst 0x6e99a42c  // ummla v12.4s, v1.16b, v25.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e99a474  // ummla v20.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xb0]\n"
+      ".inst 0x6e9aa429  // ummla v9.4s, v1.16b, v26.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      ".inst 0x6e9aa471  // ummla v17.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xc0]\n"
+      ".inst 0x6e99a42d  // ummla v13.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a475  // ummla v21.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xd0]\n"
+      ".inst 0x6e9aa42a  // ummla v10.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa472  // ummla v18.4s, v3.16b, v26.16b\n"
+      "ldr q26, [x10, #0xe0]\n"
+      ".inst 0x6e99a42e  // ummla v14.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a476  // ummla v22.4s, v3.16b, v25.16b\n"
+      "ldr q25, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e9aa42b  // ummla v11.4s, v1.16b, v26.16b\n"
+      ".inst 0x6e9aa473  // ummla v19.4s, v3.16b, v26.16b\n"
+      ".inst 0x6e99a42f  // ummla v15.4s, v1.16b, v25.16b\n"
+      ".inst 0x6e99a477  // ummla v23.4s, v3.16b, v25.16b\n"
       "131:"  // Height 4: Multiply loop: Main loop skip
       "cbz x27, 138f\n"
       "cmp x27, #0x8\n"
       "blt 133f\n"
       "132:"  // Height 4: Multiply loop: Odd block loop
-      "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d26, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "trn1 v28.2d, v26.2d, v25.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d26, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "trn1 v27.2d, v26.2d, v25.2d\n"
       "cmp x27, #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a450  // ummla v16.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a454  // ummla v20.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a451  // ummla v17.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a455  // ummla v21.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a452  // ummla v18.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e87a40e  // ummla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a456  // ummla v22.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
+      ".inst 0x6e9aa788  // ummla v8.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa770  // ummla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e99a78c  // ummla v12.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a774  // ummla v20.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e86a40b  // ummla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a453  // ummla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a40f  // ummla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
       "bge 132b\n"
       "133:"  // Height 4: Multiply loop: Skip odd blocks
       "cbz x27, 138f\n"
@@ -1742,33 +1741,33 @@
       "ldr b3, [x24, #0x0]\n"
       "ldr b4, [x23, #0x0]\n"
       "137:"  // Height 4: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "ldr q6, [x10, #0x10]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
+      "ldr q26, [x10, #0x0]\n"
+      "ldr q25, [x10, #0x10]\n"
+      "trn1 v28.2d, v1.2d, v2.2d\n"
+      "trn1 v27.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e9aa788  // ummla v8.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa770  // ummla v16.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x20]\n"
+      ".inst 0x6e99a78c  // ummla v12.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a774  // ummla v20.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x30]\n"
+      ".inst 0x6e9aa789  // ummla v9.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa771  // ummla v17.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x40]\n"
+      ".inst 0x6e99a78d  // ummla v13.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a775  // ummla v21.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x50]\n"
+      ".inst 0x6e9aa78a  // ummla v10.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa772  // ummla v18.4s, v27.16b, v26.16b\n"
+      "ldr q26, [x10, #0x60]\n"
+      ".inst 0x6e99a78e  // ummla v14.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a776  // ummla v22.4s, v27.16b, v25.16b\n"
+      "ldr q25, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e9aa78b  // ummla v11.4s, v28.16b, v26.16b\n"
+      ".inst 0x6e9aa773  // ummla v19.4s, v27.16b, v26.16b\n"
+      ".inst 0x6e99a78f  // ummla v15.4s, v28.16b, v25.16b\n"
+      ".inst 0x6e99a777  // ummla v23.4s, v27.16b, v25.16b\n"
       "138:"  // Height 4: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2125,15 +2124,15 @@
       "163:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 164f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 165f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2144,10 +2143,10 @@
       "b 165f\n"
       "164:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "165:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "blt 168f\n"
@@ -2160,174 +2159,174 @@
       "ldr q7, [x10, #0x0]\n"
       "blt 167f\n"
       "166:"  // Height 5: Multiply loop: Main loop head
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a4c8  // ummla v8.4s, v6.16b, v7.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
       "sub x27, x27, #0x10\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x10, #0x10]\n"
       ".inst 0x6e87a498  // ummla v24.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4cc  // ummla v12.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a454  // ummla v20.4s, v2.16b, v0.16b\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6e86a49c  // ummla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a49c  // ummla v28.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e87a4c9  // ummla v9.4s, v6.16b, v7.16b\n"
       "add x25, x25, #0x10\n"
       ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
       ".inst 0x6e87a499  // ummla v25.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x40]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4cd  // ummla v13.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a455  // ummla v21.4s, v2.16b, v0.16b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6e86a49d  // ummla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a49d  // ummla v29.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e87a4ca  // ummla v10.4s, v6.16b, v7.16b\n"
       "cmp x27, #0x20\n"
       ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
       ".inst 0x6e87a49a  // ummla v26.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x60]\n"
       "prfm pldl1keep, [x26, #0x80]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4ce  // ummla v14.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a456  // ummla v22.4s, v2.16b, v0.16b\n"
       "prfm pldl1keep, [x25, #0x80]\n"
       "prfm pldl1keep, [x24, #0x80]\n"
-      ".inst 0x6e86a49e  // ummla v30.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a49e  // ummla v30.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x6e87a4cb  // ummla v11.4s, v6.16b, v7.16b\n"
       "prfm pldl1keep, [x23, #0x80]\n"
       ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
       ".inst 0x6e87a49b  // ummla v27.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x80]\n"
       "prfm pldl1keep, [x22, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4cf  // ummla v15.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a457  // ummla v23.4s, v2.16b, v0.16b\n"
       "ldr q2, [x25, #0x0]\n"
-      ".inst 0x6e86a49f  // ummla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e80a49f  // ummla v31.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x6e87a4b8  // ummla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bc  // ummla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4b9  // ummla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bd  // ummla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4ba  // ummla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4be  // ummla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "ldr q6, [x10, #0xa0]\n"
+      ".inst 0x6e80a42c  // ummla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a474  // ummla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4bc  // ummla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xb0]\n"
+      ".inst 0x6e86a429  // ummla v9.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a471  // ummla v17.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a4b9  // ummla v25.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xc0]\n"
+      ".inst 0x6e80a42d  // ummla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a475  // ummla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4bd  // ummla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xd0]\n"
+      ".inst 0x6e86a42a  // ummla v10.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a472  // ummla v18.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a4ba  // ummla v26.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xe0]\n"
+      ".inst 0x6e80a42e  // ummla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a476  // ummla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4be  // ummla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4bb  // ummla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x6e86a42b  // ummla v11.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a473  // ummla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a4bb  // ummla v27.4s, v5.16b, v6.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e80a42f  // ummla v15.4s, v1.16b, v0.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e80a477  // ummla v23.4s, v3.16b, v0.16b\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x6e86a4bf  // ummla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x6e80a4bf  // ummla v31.4s, v5.16b, v0.16b\n"
       "ldr q5, [x22, #0x0]\n"
       "bge 166b\n"
       "167:"  // Height 5: Multiply loop: Single iteration only
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "trn1 v6.2d, v1.2d, v2.2d\n"
       "trn2 v1.2d, v1.2d, v2.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a4c8  // ummla v8.4s, v6.16b, v7.16b\n"
       "trn1 v2.2d, v3.2d, v4.2d\n"
       "trn2 v3.2d, v3.2d, v4.2d\n"
       ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
       "add x26, x26, #0x10\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "trn2 v5.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
+      "trn1 v4.2d, v5.2d, v0.2d\n"
+      "trn2 v5.2d, v5.2d, v0.2d\n"
+      "ldr q0, [x10, #0x10]\n"
       ".inst 0x6e87a498  // ummla v24.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4cc  // ummla v12.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a454  // ummla v20.4s, v2.16b, v0.16b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6e86a49c  // ummla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a49c  // ummla v28.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e87a4c9  // ummla v9.4s, v6.16b, v7.16b\n"
       "add x24, x24, #0x10\n"
       ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
       ".inst 0x6e87a499  // ummla v25.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x40]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4cd  // ummla v13.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a455  // ummla v21.4s, v2.16b, v0.16b\n"
       "add x22, x22, #0x10\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x6e86a49d  // ummla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a49d  // ummla v29.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e87a4ca  // ummla v10.4s, v6.16b, v7.16b\n"
       "prfm pldl1keep, [x26, #0x80]\n"
       ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
       ".inst 0x6e87a49a  // ummla v26.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x60]\n"
       "prfm pldl1keep, [x25, #0x80]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a4ce  // ummla v14.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a456  // ummla v22.4s, v2.16b, v0.16b\n"
       "prfm pldl1keep, [x24, #0x80]\n"
       "prfm pldl1keep, [x23, #0x80]\n"
-      ".inst 0x6e86a49e  // ummla v30.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a49e  // ummla v30.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x6e87a4cb  // ummla v11.4s, v6.16b, v7.16b\n"
       "prfm pldl1keep, [x22, #0x80]\n"
       ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
       ".inst 0x6e87a49b  // ummla v27.4s, v4.16b, v7.16b\n"
       "ldr q7, [x10, #0x80]\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49f  // ummla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      ".inst 0x6e80a4cf  // ummla v15.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e80a457  // ummla v23.4s, v2.16b, v0.16b\n"
+      ".inst 0x6e80a49f  // ummla v31.4s, v4.16b, v0.16b\n"
+      "ldr q2, [x10, #0x90]\n"
       ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x6e87a4b8  // ummla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bc  // ummla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4b9  // ummla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bd  // ummla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4ba  // ummla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4be  // ummla v30.4s, v5.16b, v6.16b\n"
+      "ldr q0, [x10, #0xa0]\n"
+      ".inst 0x6e82a42c  // ummla v12.4s, v1.16b, v2.16b\n"
+      ".inst 0x6e82a474  // ummla v20.4s, v3.16b, v2.16b\n"
+      ".inst 0x6e82a4bc  // ummla v28.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xb0]\n"
+      ".inst 0x6e80a429  // ummla v9.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a471  // ummla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4b9  // ummla v25.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xc0]\n"
+      ".inst 0x6e82a42d  // ummla v13.4s, v1.16b, v2.16b\n"
+      ".inst 0x6e82a475  // ummla v21.4s, v3.16b, v2.16b\n"
+      ".inst 0x6e82a4bd  // ummla v29.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xd0]\n"
+      ".inst 0x6e80a42a  // ummla v10.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a472  // ummla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4ba  // ummla v26.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xe0]\n"
+      ".inst 0x6e82a42e  // ummla v14.4s, v1.16b, v2.16b\n"
+      ".inst 0x6e82a476  // ummla v22.4s, v3.16b, v2.16b\n"
+      ".inst 0x6e82a4be  // ummla v30.4s, v5.16b, v2.16b\n"
       "ldr q6, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4bb  // ummla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x6e80a42b  // ummla v11.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a473  // ummla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4bb  // ummla v27.4s, v5.16b, v0.16b\n"
       ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
       ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
       ".inst 0x6e86a4bf  // ummla v31.4s, v5.16b, v6.16b\n"
@@ -2337,48 +2336,48 @@
       "blt 170f\n"
       "169:"  // Height 5: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr q6, [x10, #0x0]\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e86a450  // ummla v16.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a498  // ummla v24.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a454  // ummla v20.4s, v2.16b, v7.16b\n"
+      "ldr d0, [x22], #0x8\n"
+      "ldr q1, [x10, #0x0]\n"
+      "trn1 v2.2d, v0.2d, v2.2d\n"
+      ".inst 0x6e81a488  // ummla v8.4s, v4.16b, v1.16b\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e81a470  // ummla v16.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a458  // ummla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x6e80a48c  // ummla v12.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a474  // ummla v20.4s, v3.16b, v0.16b\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6e87a49c  // ummla v28.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a451  // ummla v17.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a499  // ummla v25.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a455  // ummla v21.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49d  // ummla v29.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a452  // ummla v18.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49a  // ummla v26.4s, v4.16b, v6.16b\n"
+      ".inst 0x6e80a45c  // ummla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e81a489  // ummla v9.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e81a471  // ummla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a459  // ummla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x40]\n"
+      ".inst 0x6e80a48d  // ummla v13.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a475  // ummla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45d  // ummla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e81a48a  // ummla v10.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e81a472  // ummla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45a  // ummla v26.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e87a40e  // ummla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a456  // ummla v22.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49e  // ummla v30.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
-      ".inst 0x6e86a40b  // ummla v11.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e80a48e  // ummla v14.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a476  // ummla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45e  // ummla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
+      ".inst 0x6e86a48b  // ummla v11.4s, v4.16b, v6.16b\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e86a453  // ummla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49b  // ummla v27.4s, v4.16b, v6.16b\n"
-      ".inst 0x6e87a40f  // ummla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49f  // ummla v31.4s, v4.16b, v7.16b\n"
+      ".inst 0x6e86a473  // ummla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a45b  // ummla v27.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a48f  // ummla v15.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a477  // ummla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45f  // ummla v31.4s, v2.16b, v0.16b\n"
       "bge 169b\n"
       "170:"  // Height 5: Multiply loop: Skip odd blocks
       "cbz x27, 175f\n"
@@ -2430,42 +2429,42 @@
       "ldr b4, [x23, #0x0]\n"
       "ldr b5, [x22, #0x0]\n"
       "174:"  // Height 5: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a498  // ummla v24.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49c  // ummla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a499  // ummla v25.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49d  // ummla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49a  // ummla v26.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49e  // ummla v30.4s, v4.16b, v6.16b\n"
+      "ldr q6, [x10, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      "trn1 v2.2d, v5.2d, v0.2d\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x6e86a4e8  // ummla v8.4s, v7.16b, v6.16b\n"
+      ".inst 0x6e86a470  // ummla v16.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a458  // ummla v24.4s, v2.16b, v6.16b\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x6e81a4ec  // ummla v12.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e81a474  // ummla v20.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45c  // ummla v28.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e80a4e9  // ummla v9.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e80a471  // ummla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a459  // ummla v25.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x40]\n"
+      ".inst 0x6e81a4ed  // ummla v13.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e81a475  // ummla v21.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45d  // ummla v29.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e80a4ea  // ummla v10.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e80a472  // ummla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45a  // ummla v26.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x60]\n"
+      ".inst 0x6e81a4ee  // ummla v14.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e81a476  // ummla v22.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45e  // ummla v30.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49b  // ummla v27.4s, v4.16b, v7.16b\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49f  // ummla v31.4s, v4.16b, v6.16b\n"
+      ".inst 0x6e80a4eb  // ummla v11.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e80a473  // ummla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45b  // ummla v27.4s, v2.16b, v0.16b\n"
+      ".inst 0x6e86a4ef  // ummla v15.4s, v7.16b, v6.16b\n"
+      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a45f  // ummla v31.4s, v2.16b, v6.16b\n"
       "175:"  // Height 5: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2872,16 +2871,16 @@
       "200:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 201f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 202f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2893,11 +2892,11 @@
       "b 202f\n"
       "201:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "202:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "blt 205f\n"
@@ -2964,42 +2963,42 @@
       "ldr q2, [x25, #0x0]\n"
       "prfm pldl1keep, [x21, #0x80]\n"
       ".inst 0x6e86a49f  // ummla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      "ldr q0, [x10, #0x90]\n"
       "ldr q4, [x23, #0x0]\n"
       ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x6e87a4b8  // ummla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bc  // ummla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4b9  // ummla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bd  // ummla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4ba  // ummla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4be  // ummla v30.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xf0]\n"
+      "ldr q6, [x10, #0xa0]\n"
+      ".inst 0x6e80a42c  // ummla v12.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a474  // ummla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4bc  // ummla v28.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xb0]\n"
+      ".inst 0x6e86a429  // ummla v9.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a471  // ummla v17.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a4b9  // ummla v25.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xc0]\n"
+      ".inst 0x6e80a42d  // ummla v13.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a475  // ummla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4bd  // ummla v29.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xd0]\n"
+      ".inst 0x6e86a42a  // ummla v10.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a472  // ummla v18.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a4ba  // ummla v26.4s, v5.16b, v6.16b\n"
+      "ldr q6, [x10, #0xe0]\n"
+      ".inst 0x6e80a42e  // ummla v14.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a476  // ummla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4be  // ummla v30.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4bb  // ummla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x6e86a42b  // ummla v11.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e86a473  // ummla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a4bb  // ummla v27.4s, v5.16b, v6.16b\n"
       "ldr q7, [x10, #0x0]\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e80a42f  // ummla v15.4s, v1.16b, v0.16b\n"
       "ldr q1, [x26, #0x0]\n"
-      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e80a477  // ummla v23.4s, v3.16b, v0.16b\n"
       "ldr q3, [x24, #0x0]\n"
-      ".inst 0x6e86a4bf  // ummla v31.4s, v5.16b, v6.16b\n"
+      ".inst 0x6e80a4bf  // ummla v31.4s, v5.16b, v0.16b\n"
       "ldr q5, [x22, #0x0]\n"
       "ldr q6, [x21, #0x0]\n"
       "bge 203b\n"
@@ -3055,35 +3054,35 @@
       ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
       ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
       ".inst 0x6e86a49f  // ummla v31.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x90]\n"
+      "ldr q2, [x10, #0x90]\n"
       ".inst 0x6e87a428  // ummla v8.4s, v1.16b, v7.16b\n"
       ".inst 0x6e87a470  // ummla v16.4s, v3.16b, v7.16b\n"
       ".inst 0x6e87a4b8  // ummla v24.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xa0]\n"
-      ".inst 0x6e86a42c  // ummla v12.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a474  // ummla v20.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bc  // ummla v28.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xb0]\n"
-      ".inst 0x6e87a429  // ummla v9.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a471  // ummla v17.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4b9  // ummla v25.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xc0]\n"
-      ".inst 0x6e86a42d  // ummla v13.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a475  // ummla v21.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4bd  // ummla v29.4s, v5.16b, v6.16b\n"
-      "ldr q6, [x10, #0xd0]\n"
-      ".inst 0x6e87a42a  // ummla v10.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a472  // ummla v18.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4ba  // ummla v26.4s, v5.16b, v7.16b\n"
-      "ldr q7, [x10, #0xe0]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e86a476  // ummla v22.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e86a4be  // ummla v30.4s, v5.16b, v6.16b\n"
+      "ldr q0, [x10, #0xa0]\n"
+      ".inst 0x6e82a42c  // ummla v12.4s, v1.16b, v2.16b\n"
+      ".inst 0x6e82a474  // ummla v20.4s, v3.16b, v2.16b\n"
+      ".inst 0x6e82a4bc  // ummla v28.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xb0]\n"
+      ".inst 0x6e80a429  // ummla v9.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a471  // ummla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4b9  // ummla v25.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xc0]\n"
+      ".inst 0x6e82a42d  // ummla v13.4s, v1.16b, v2.16b\n"
+      ".inst 0x6e82a475  // ummla v21.4s, v3.16b, v2.16b\n"
+      ".inst 0x6e82a4bd  // ummla v29.4s, v5.16b, v2.16b\n"
+      "ldr q2, [x10, #0xd0]\n"
+      ".inst 0x6e80a42a  // ummla v10.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a472  // ummla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4ba  // ummla v26.4s, v5.16b, v0.16b\n"
+      "ldr q0, [x10, #0xe0]\n"
+      ".inst 0x6e82a42e  // ummla v14.4s, v1.16b, v2.16b\n"
+      ".inst 0x6e82a476  // ummla v22.4s, v3.16b, v2.16b\n"
+      ".inst 0x6e82a4be  // ummla v30.4s, v5.16b, v2.16b\n"
       "ldr q6, [x10, #0xf0]\n"
       "add x10, x10, #0x100\n"
-      ".inst 0x6e87a42b  // ummla v11.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e87a473  // ummla v19.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e87a4bb  // ummla v27.4s, v5.16b, v7.16b\n"
+      ".inst 0x6e80a42b  // ummla v11.4s, v1.16b, v0.16b\n"
+      ".inst 0x6e80a473  // ummla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a4bb  // ummla v27.4s, v5.16b, v0.16b\n"
       ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
       ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
       ".inst 0x6e86a4bf  // ummla v31.4s, v5.16b, v6.16b\n"
@@ -3093,49 +3092,49 @@
       "blt 207f\n"
       "206:"  // Height 6: Multiply loop: Odd block loop
       "ldr d1, [x26], #0x8\n"
-      "ldr d2, [x25], #0x8\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
+      "ldr d0, [x25], #0x8\n"
+      "trn1 v4.2d, v1.2d, v0.2d\n"
       "sub x27, x27, #0x8\n"
-      "ldr d3, [x24], #0x8\n"
-      "ldr d4, [x23], #0x8\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
+      "ldr d1, [x24], #0x8\n"
+      "ldr d0, [x23], #0x8\n"
+      "trn1 v3.2d, v1.2d, v0.2d\n"
       "cmp x27, #0x8\n"
-      "ldr d5, [x22], #0x8\n"
-      "ldr d7, [x21], #0x8\n"
-      "trn1 v4.2d, v5.2d, v7.2d\n"
-      "ldr q6, [x10, #0x0]\n"
-      "ldr q7, [x10, #0x10]\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a450  // ummla v16.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a498  // ummla v24.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x20]\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a454  // ummla v20.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49c  // ummla v28.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a451  // ummla v17.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a499  // ummla v25.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x40]\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a455  // ummla v21.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49d  // ummla v29.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x50]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a452  // ummla v18.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49a  // ummla v26.4s, v4.16b, v6.16b\n"
+      "ldr d1, [x22], #0x8\n"
+      "ldr d0, [x21], #0x8\n"
+      "trn1 v2.2d, v1.2d, v0.2d\n"
+      "ldr q1, [x10, #0x0]\n"
+      "ldr q0, [x10, #0x10]\n"
+      ".inst 0x6e81a488  // ummla v8.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e81a470  // ummla v16.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a458  // ummla v24.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x20]\n"
+      ".inst 0x6e80a48c  // ummla v12.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a474  // ummla v20.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45c  // ummla v28.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x30]\n"
+      ".inst 0x6e81a489  // ummla v9.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e81a471  // ummla v17.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a459  // ummla v25.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x40]\n"
+      ".inst 0x6e80a48d  // ummla v13.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a475  // ummla v21.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45d  // ummla v29.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x50]\n"
+      ".inst 0x6e81a48a  // ummla v10.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e81a472  // ummla v18.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45a  // ummla v26.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x60]\n"
-      ".inst 0x6e87a40e  // ummla v14.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a456  // ummla v22.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49e  // ummla v30.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x70]\n"
+      ".inst 0x6e80a48e  // ummla v14.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a476  // ummla v22.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45e  // ummla v30.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x70]\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e86a40b  // ummla v11.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a453  // ummla v19.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49b  // ummla v27.4s, v4.16b, v6.16b\n"
-      ".inst 0x6e87a40f  // ummla v15.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49f  // ummla v31.4s, v4.16b, v7.16b\n"
+      ".inst 0x6e86a48b  // ummla v11.4s, v4.16b, v6.16b\n"
+      ".inst 0x6e86a473  // ummla v19.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a45b  // ummla v27.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e80a48f  // ummla v15.4s, v4.16b, v0.16b\n"
+      ".inst 0x6e80a477  // ummla v23.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45f  // ummla v31.4s, v2.16b, v0.16b\n"
       "bge 206b\n"
       "207:"  // Height 6: Multiply loop: Skip odd blocks
       "cbz x27, 212f\n"
@@ -3194,42 +3193,42 @@
       "ldr b5, [x22, #0x0]\n"
       "ldr b6, [x21, #0x0]\n"
       "211:"  // Height 6: Multiply loop: Ragged operand read: Done
-      "ldr q7, [x10, #0x0]\n"
-      "trn1 v0.2d, v1.2d, v2.2d\n"
-      "trn1 v2.2d, v3.2d, v4.2d\n"
-      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
-      "trn1 v4.2d, v5.2d, v6.2d\n"
-      "ldr q6, [x10, #0x10]\n"
-      ".inst 0x6e87a450  // ummla v16.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a498  // ummla v24.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x20]\n"
-      ".inst 0x6e86a40c  // ummla v12.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49c  // ummla v28.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x30]\n"
-      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a451  // ummla v17.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a499  // ummla v25.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x40]\n"
-      ".inst 0x6e86a40d  // ummla v13.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49d  // ummla v29.4s, v4.16b, v6.16b\n"
-      "ldr q6, [x10, #0x50]\n"
-      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e87a452  // ummla v18.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49a  // ummla v26.4s, v4.16b, v7.16b\n"
-      "ldr q7, [x10, #0x60]\n"
-      ".inst 0x6e86a40e  // ummla v14.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49e  // ummla v30.4s, v4.16b, v6.16b\n"
+      "ldr q0, [x10, #0x0]\n"
+      "trn1 v7.2d, v1.2d, v2.2d\n"
+      "trn1 v3.2d, v3.2d, v4.2d\n"
+      ".inst 0x6e80a4e8  // ummla v8.4s, v7.16b, v0.16b\n"
+      "trn1 v2.2d, v5.2d, v6.2d\n"
+      "ldr q1, [x10, #0x10]\n"
+      ".inst 0x6e80a470  // ummla v16.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a458  // ummla v24.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x20]\n"
+      ".inst 0x6e81a4ec  // ummla v12.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e81a474  // ummla v20.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45c  // ummla v28.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x30]\n"
+      ".inst 0x6e80a4e9  // ummla v9.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e80a471  // ummla v17.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a459  // ummla v25.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x40]\n"
+      ".inst 0x6e81a4ed  // ummla v13.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e81a475  // ummla v21.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45d  // ummla v29.4s, v2.16b, v1.16b\n"
+      "ldr q1, [x10, #0x50]\n"
+      ".inst 0x6e80a4ea  // ummla v10.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e80a472  // ummla v18.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45a  // ummla v26.4s, v2.16b, v0.16b\n"
+      "ldr q0, [x10, #0x60]\n"
+      ".inst 0x6e81a4ee  // ummla v14.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e81a476  // ummla v22.4s, v3.16b, v1.16b\n"
+      ".inst 0x6e81a45e  // ummla v30.4s, v2.16b, v1.16b\n"
       "ldr q6, [x10, #0x70]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e80a4eb  // ummla v11.4s, v7.16b, v0.16b\n"
       "add x10, x10, #0x80\n"
-      ".inst 0x6e87a453  // ummla v19.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e87a49b  // ummla v27.4s, v4.16b, v7.16b\n"
-      ".inst 0x6e86a40f  // ummla v15.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e86a457  // ummla v23.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e86a49f  // ummla v31.4s, v4.16b, v6.16b\n"
+      ".inst 0x6e80a473  // ummla v19.4s, v3.16b, v0.16b\n"
+      ".inst 0x6e80a45b  // ummla v27.4s, v2.16b, v0.16b\n"
+      ".inst 0x6e86a4ef  // ummla v15.4s, v7.16b, v6.16b\n"
+      ".inst 0x6e86a477  // ummla v23.4s, v3.16b, v6.16b\n"
+      ".inst 0x6e86a45f  // ummla v31.4s, v2.16b, v6.16b\n"
       "212:"  // Height 6: Multiply loop: No odd multiplies
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3440,7 +3439,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "224:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 153a4cc..25c5bf1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@
         return 12;
     }
 
-    static unsigned int stripe_width()
-    {
-        return 4;
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 2;
@@ -97,5 +92,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index b3bde74..5684f46 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_bf16fp32_dot_8x12(
-    const bfloat16 *Apanel, const bfloat16 *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const bfloat16 *Apanel,
+    const bfloat16 *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -88,8 +91,8 @@
       "movi v31.16b, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
+      "ldr q3, [%x[Apanel], #0x20]\n"
+      "ldr q7, [%x[Apanel], #0x30]\n"
       ".inst 0x4f40f088  // bfdot v8.4s, v4.8h, v0.h[0]\n"
       ".inst 0x4f60f08b  // bfdot v11.4s, v4.8h, v0.h[1]\n"
       ".inst 0x4f40f88e  // bfdot v14.4s, v4.8h, v0.h[2]\n"
@@ -123,35 +126,35 @@
       ".inst 0x4f61f0d9  // bfdot v25.4s, v6.8h, v1.h[1]\n"
       ".inst 0x4f41f8dc  // bfdot v28.4s, v6.8h, v1.h[2]\n"
       ".inst 0x4f61f8df  // bfdot v31.4s, v6.8h, v1.h[3]\n"
-      "ldr q6, [x22, #0x50]\n"
+      "ldr q2, [x22, #0x50]\n"
       "ldr q1, [%x[Apanel], #0x10]\n"
       "add x22, x22, #0x60\n"
-      ".inst 0x4f42f088  // bfdot v8.4s, v4.8h, v2.h[0]\n"
-      ".inst 0x4f62f08b  // bfdot v11.4s, v4.8h, v2.h[1]\n"
-      ".inst 0x4f42f88e  // bfdot v14.4s, v4.8h, v2.h[2]\n"
-      ".inst 0x4f62f891  // bfdot v17.4s, v4.8h, v2.h[3]\n"
-      ".inst 0x4f43f094  // bfdot v20.4s, v4.8h, v3.h[0]\n"
-      ".inst 0x4f63f097  // bfdot v23.4s, v4.8h, v3.h[1]\n"
-      ".inst 0x4f43f89a  // bfdot v26.4s, v4.8h, v3.h[2]\n"
-      ".inst 0x4f63f89d  // bfdot v29.4s, v4.8h, v3.h[3]\n"
+      ".inst 0x4f43f088  // bfdot v8.4s, v4.8h, v3.h[0]\n"
+      ".inst 0x4f63f08b  // bfdot v11.4s, v4.8h, v3.h[1]\n"
+      ".inst 0x4f43f88e  // bfdot v14.4s, v4.8h, v3.h[2]\n"
+      ".inst 0x4f63f891  // bfdot v17.4s, v4.8h, v3.h[3]\n"
+      ".inst 0x4f47f094  // bfdot v20.4s, v4.8h, v7.h[0]\n"
+      ".inst 0x4f67f097  // bfdot v23.4s, v4.8h, v7.h[1]\n"
+      ".inst 0x4f47f89a  // bfdot v26.4s, v4.8h, v7.h[2]\n"
+      ".inst 0x4f67f89d  // bfdot v29.4s, v4.8h, v7.h[3]\n"
       "ldr q4, [x22, #0x0]\n"
-      ".inst 0x4f42f0a9  // bfdot v9.4s, v5.8h, v2.h[0]\n"
-      ".inst 0x4f62f0ac  // bfdot v12.4s, v5.8h, v2.h[1]\n"
-      ".inst 0x4f42f8af  // bfdot v15.4s, v5.8h, v2.h[2]\n"
-      ".inst 0x4f62f8b2  // bfdot v18.4s, v5.8h, v2.h[3]\n"
-      ".inst 0x4f43f0b5  // bfdot v21.4s, v5.8h, v3.h[0]\n"
-      ".inst 0x4f63f0b8  // bfdot v24.4s, v5.8h, v3.h[1]\n"
-      ".inst 0x4f43f8bb  // bfdot v27.4s, v5.8h, v3.h[2]\n"
-      ".inst 0x4f63f8be  // bfdot v30.4s, v5.8h, v3.h[3]\n"
+      ".inst 0x4f43f0a9  // bfdot v9.4s, v5.8h, v3.h[0]\n"
+      ".inst 0x4f63f0ac  // bfdot v12.4s, v5.8h, v3.h[1]\n"
+      ".inst 0x4f43f8af  // bfdot v15.4s, v5.8h, v3.h[2]\n"
+      ".inst 0x4f63f8b2  // bfdot v18.4s, v5.8h, v3.h[3]\n"
+      ".inst 0x4f47f0b5  // bfdot v21.4s, v5.8h, v7.h[0]\n"
+      ".inst 0x4f67f0b8  // bfdot v24.4s, v5.8h, v7.h[1]\n"
+      ".inst 0x4f47f8bb  // bfdot v27.4s, v5.8h, v7.h[2]\n"
+      ".inst 0x4f67f8be  // bfdot v30.4s, v5.8h, v7.h[3]\n"
       "ldr q5, [x22, #0x10]\n"
-      ".inst 0x4f42f0ca  // bfdot v10.4s, v6.8h, v2.h[0]\n"
-      ".inst 0x4f62f0cd  // bfdot v13.4s, v6.8h, v2.h[1]\n"
-      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
-      ".inst 0x4f62f8d3  // bfdot v19.4s, v6.8h, v2.h[3]\n"
-      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
-      ".inst 0x4f63f0d9  // bfdot v25.4s, v6.8h, v3.h[1]\n"
-      ".inst 0x4f43f8dc  // bfdot v28.4s, v6.8h, v3.h[2]\n"
-      ".inst 0x4f63f8df  // bfdot v31.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f43f04a  // bfdot v10.4s, v2.8h, v3.h[0]\n"
+      ".inst 0x4f63f04d  // bfdot v13.4s, v2.8h, v3.h[1]\n"
+      ".inst 0x4f43f850  // bfdot v16.4s, v2.8h, v3.h[2]\n"
+      ".inst 0x4f63f853  // bfdot v19.4s, v2.8h, v3.h[3]\n"
+      ".inst 0x4f47f056  // bfdot v22.4s, v2.8h, v7.h[0]\n"
+      ".inst 0x4f67f059  // bfdot v25.4s, v2.8h, v7.h[1]\n"
+      ".inst 0x4f47f85c  // bfdot v28.4s, v2.8h, v7.h[2]\n"
+      ".inst 0x4f67f85f  // bfdot v31.4s, v2.8h, v7.h[3]\n"
       "ldr q6, [x22, #0x20]\n"
       "bge 3b\n"
       "4:"  // main loop skip
@@ -182,37 +185,37 @@
       ".inst 0x4f41f8dc  // bfdot v28.4s, v6.8h, v1.h[2]\n"
       ".inst 0x4f61f8df  // bfdot v31.4s, v6.8h, v1.h[3]\n"
       "cbz x20, 5f\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
+      "ldr q4, [%x[Apanel], #0x0]\n"
+      "ldr q3, [%x[Apanel], #0x10]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ldr q7, [x22, #0x0]\n"
-      "ldr q4, [x22, #0x10]\n"
-      ".inst 0x4f40f0e8  // bfdot v8.4s, v7.8h, v0.h[0]\n"
-      "ldr q5, [x22, #0x20]\n"
-      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
-      ".inst 0x4f40f8ee  // bfdot v14.4s, v7.8h, v0.h[2]\n"
-      ".inst 0x4f60f8f1  // bfdot v17.4s, v7.8h, v0.h[3]\n"
-      ".inst 0x4f41f0f4  // bfdot v20.4s, v7.8h, v1.h[0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q1, [x22, #0x10]\n"
+      ".inst 0x4f44f048  // bfdot v8.4s, v2.8h, v4.h[0]\n"
+      "ldr q0, [x22, #0x20]\n"
+      ".inst 0x4f64f04b  // bfdot v11.4s, v2.8h, v4.h[1]\n"
+      ".inst 0x4f44f84e  // bfdot v14.4s, v2.8h, v4.h[2]\n"
+      ".inst 0x4f64f851  // bfdot v17.4s, v2.8h, v4.h[3]\n"
+      ".inst 0x4f43f054  // bfdot v20.4s, v2.8h, v3.h[0]\n"
       "add x22, x22, #0x30\n"
-      ".inst 0x4f61f0f7  // bfdot v23.4s, v7.8h, v1.h[1]\n"
-      ".inst 0x4f41f8fa  // bfdot v26.4s, v7.8h, v1.h[2]\n"
-      ".inst 0x4f61f8fd  // bfdot v29.4s, v7.8h, v1.h[3]\n"
-      ".inst 0x4f40f089  // bfdot v9.4s, v4.8h, v0.h[0]\n"
-      ".inst 0x4f60f08c  // bfdot v12.4s, v4.8h, v0.h[1]\n"
-      ".inst 0x4f40f88f  // bfdot v15.4s, v4.8h, v0.h[2]\n"
-      ".inst 0x4f60f892  // bfdot v18.4s, v4.8h, v0.h[3]\n"
-      ".inst 0x4f41f095  // bfdot v21.4s, v4.8h, v1.h[0]\n"
-      ".inst 0x4f61f098  // bfdot v24.4s, v4.8h, v1.h[1]\n"
-      ".inst 0x4f41f89b  // bfdot v27.4s, v4.8h, v1.h[2]\n"
-      ".inst 0x4f61f89e  // bfdot v30.4s, v4.8h, v1.h[3]\n"
-      ".inst 0x4f40f0aa  // bfdot v10.4s, v5.8h, v0.h[0]\n"
-      ".inst 0x4f60f0ad  // bfdot v13.4s, v5.8h, v0.h[1]\n"
-      ".inst 0x4f40f8b0  // bfdot v16.4s, v5.8h, v0.h[2]\n"
-      ".inst 0x4f60f8b3  // bfdot v19.4s, v5.8h, v0.h[3]\n"
-      ".inst 0x4f41f0b6  // bfdot v22.4s, v5.8h, v1.h[0]\n"
-      ".inst 0x4f61f0b9  // bfdot v25.4s, v5.8h, v1.h[1]\n"
-      ".inst 0x4f41f8bc  // bfdot v28.4s, v5.8h, v1.h[2]\n"
-      ".inst 0x4f61f8bf  // bfdot v31.4s, v5.8h, v1.h[3]\n"
+      ".inst 0x4f63f057  // bfdot v23.4s, v2.8h, v3.h[1]\n"
+      ".inst 0x4f43f85a  // bfdot v26.4s, v2.8h, v3.h[2]\n"
+      ".inst 0x4f63f85d  // bfdot v29.4s, v2.8h, v3.h[3]\n"
+      ".inst 0x4f44f029  // bfdot v9.4s, v1.8h, v4.h[0]\n"
+      ".inst 0x4f64f02c  // bfdot v12.4s, v1.8h, v4.h[1]\n"
+      ".inst 0x4f44f82f  // bfdot v15.4s, v1.8h, v4.h[2]\n"
+      ".inst 0x4f64f832  // bfdot v18.4s, v1.8h, v4.h[3]\n"
+      ".inst 0x4f43f035  // bfdot v21.4s, v1.8h, v3.h[0]\n"
+      ".inst 0x4f63f038  // bfdot v24.4s, v1.8h, v3.h[1]\n"
+      ".inst 0x4f43f83b  // bfdot v27.4s, v1.8h, v3.h[2]\n"
+      ".inst 0x4f63f83e  // bfdot v30.4s, v1.8h, v3.h[3]\n"
+      ".inst 0x4f44f00a  // bfdot v10.4s, v0.8h, v4.h[0]\n"
+      ".inst 0x4f64f00d  // bfdot v13.4s, v0.8h, v4.h[1]\n"
+      ".inst 0x4f44f810  // bfdot v16.4s, v0.8h, v4.h[2]\n"
+      ".inst 0x4f64f813  // bfdot v19.4s, v0.8h, v4.h[3]\n"
+      ".inst 0x4f43f016  // bfdot v22.4s, v0.8h, v3.h[0]\n"
+      ".inst 0x4f63f019  // bfdot v25.4s, v0.8h, v3.h[1]\n"
+      ".inst 0x4f43f81c  // bfdot v28.4s, v0.8h, v3.h[2]\n"
+      ".inst 0x4f63f81f  // bfdot v31.4s, v0.8h, v3.h[3]\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
       "str q8, [%x[Cpanel], #0x0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 17c93fa..66c2b92 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -57,11 +57,6 @@
         return 12;
     }
 
-    static unsigned int stripe_width()
-    {
-        return 4;
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 4;
@@ -117,5 +112,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
index cba29bc..bab687a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_bf16fp32_mmla_8x12_a510(
-    const bfloat16 *Apanel, const bfloat16 *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const bfloat16 *Apanel,
+    const bfloat16 *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -82,28 +85,28 @@
       "movi v31.16b, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0b  // bfmmla v11.4s, v0.8h, v5.8h\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      "ldp q3, q7, [x22], #0x20\n"
       ".inst 0x6e44ec2e  // bfmmla v14.4s, v1.8h, v4.8h\n"
       ".inst 0x6e45ec31  // bfmmla v17.4s, v1.8h, v5.8h\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7a  // bfmmla v26.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e44ecda  // bfmmla v26.4s, v6.8h, v4.8h\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6e45ec7d  // bfmmla v29.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e45ecdd  // bfmmla v29.4s, v6.8h, v5.8h\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e43ec09  // bfmmla v9.4s, v0.8h, v3.8h\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e43ec2f  // bfmmla v15.4s, v1.8h, v3.8h\n"
       ".inst 0x6e47ec32  // bfmmla v18.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
+      ".inst 0x6e43ec55  // bfmmla v21.4s, v2.8h, v3.8h\n"
       ".inst 0x6e47ec58  // bfmmla v24.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7b  // bfmmla v27.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7e  // bfmmla v30.4s, v3.8h, v7.8h\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      ".inst 0x6e43ecdb  // bfmmla v27.4s, v6.8h, v3.8h\n"
+      ".inst 0x6e47ecde  // bfmmla v30.4s, v6.8h, v7.8h\n"
+      "ldp q7, q3, [x22], #0x20\n"
       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
       "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
@@ -113,39 +116,39 @@
       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
       ".inst 0x6e45ec59  // bfmmla v25.4s, v2.8h, v5.8h\n"
       "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
-      ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
-      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
+      ".inst 0x6e44ecdc  // bfmmla v28.4s, v6.8h, v4.8h\n"
+      ".inst 0x6e45ecdf  // bfmmla v31.4s, v6.8h, v5.8h\n"
+      "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e47ec2e  // bfmmla v14.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e43ec31  // bfmmla v17.4s, v1.8h, v3.8h\n"
+      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e47ecda  // bfmmla v26.4s, v6.8h, v7.8h\n"
+      ".inst 0x6e43ecdd  // bfmmla v29.4s, v6.8h, v3.8h\n"
+      "ldp q7, q3, [x22], #0x20\n"
       ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
       ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
       ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
       ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
       ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
-      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e44ecdb  // bfmmla v27.4s, v6.8h, v4.8h\n"
+      ".inst 0x6e45ecde  // bfmmla v30.4s, v6.8h, v5.8h\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
       "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e47ec30  // bfmmla v16.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e43ec33  // bfmmla v19.4s, v1.8h, v3.8h\n"
       "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
       "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e47ecdc  // bfmmla v28.4s, v6.8h, v7.8h\n"
+      ".inst 0x6e43ecdf  // bfmmla v31.4s, v6.8h, v3.8h\n"
       "bge 3b\n"
       "4:"  // main loop skip
       "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
@@ -158,7 +161,7 @@
       ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
       ".inst 0x6e44ec7a  // bfmmla v26.4s, v3.8h, v4.8h\n"
       ".inst 0x6e45ec7d  // bfmmla v29.4s, v3.8h, v5.8h\n"
-      "ldp q4, q5, [x22], #0x20\n"
+      "ldp q5, q4, [x22], #0x20\n"
       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
       ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
       ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
@@ -167,93 +170,93 @@
       ".inst 0x6e47ec58  // bfmmla v24.4s, v2.8h, v7.8h\n"
       ".inst 0x6e46ec7b  // bfmmla v27.4s, v3.8h, v6.8h\n"
       ".inst 0x6e47ec7e  // bfmmla v30.4s, v3.8h, v7.8h\n"
-      ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e44ec30  // bfmmla v16.4s, v1.8h, v4.8h\n"
-      ".inst 0x6e45ec33  // bfmmla v19.4s, v1.8h, v5.8h\n"
-      ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec59  // bfmmla v25.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
-      ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e45ec0a  // bfmmla v10.4s, v0.8h, v5.8h\n"
+      ".inst 0x6e44ec0d  // bfmmla v13.4s, v0.8h, v4.8h\n"
+      ".inst 0x6e45ec30  // bfmmla v16.4s, v1.8h, v5.8h\n"
+      ".inst 0x6e44ec33  // bfmmla v19.4s, v1.8h, v4.8h\n"
+      ".inst 0x6e45ec56  // bfmmla v22.4s, v2.8h, v5.8h\n"
+      ".inst 0x6e44ec59  // bfmmla v25.4s, v2.8h, v4.8h\n"
+      ".inst 0x6e45ec7c  // bfmmla v28.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e44ec7f  // bfmmla v31.4s, v3.8h, v4.8h\n"
       "cbz x20, 5f\n"
-      "ldp q6, q7, [x22], #0x20\n"
-      "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
-      "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
-      "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
-      "ldp q6, q7, [x22], #0x20\n"
-      ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
-      ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
-      ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
-      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      "ldp q1, q0, [x22], #0x20\n"
+      "ld1 { v7.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e41ece8  // bfmmla v8.4s, v7.8h, v1.8h\n"
+      "ld1 { v5.8h }, [%x[Apanel]], #0x10\n"
+      "ld1 { v4.8h }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
+      "ldp q3, q2, [x22], #0x20\n"
+      ".inst 0x6e41ecce  // bfmmla v14.4s, v6.8h, v1.8h\n"
+      ".inst 0x6e40ecd1  // bfmmla v17.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e41ecb4  // bfmmla v20.4s, v5.8h, v1.8h\n"
+      ".inst 0x6e40ecb7  // bfmmla v23.4s, v5.8h, v0.8h\n"
+      ".inst 0x6e41ec9a  // bfmmla v26.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldp q1, q0, [x22], #0x20\n"
+      ".inst 0x6e43ece9  // bfmmla v9.4s, v7.8h, v3.8h\n"
+      ".inst 0x6e42ecec  // bfmmla v12.4s, v7.8h, v2.8h\n"
+      ".inst 0x6e43eccf  // bfmmla v15.4s, v6.8h, v3.8h\n"
+      ".inst 0x6e42ecd2  // bfmmla v18.4s, v6.8h, v2.8h\n"
+      ".inst 0x6e43ecb5  // bfmmla v21.4s, v5.8h, v3.8h\n"
+      ".inst 0x6e42ecb8  // bfmmla v24.4s, v5.8h, v2.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e42ec9e  // bfmmla v30.4s, v4.8h, v2.8h\n"
+      ".inst 0x6e41ecea  // bfmmla v10.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e40eced  // bfmmla v13.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e41ecd0  // bfmmla v16.4s, v6.8h, v1.8h\n"
+      ".inst 0x6e40ecd3  // bfmmla v19.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e41ecb6  // bfmmla v22.4s, v5.8h, v1.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      ".inst 0x6e41ec9c  // bfmmla v28.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index 2938639..8485820 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_bf16fp32_mmla_8x12(
-    const bfloat16 *Apanel, const bfloat16 *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const bfloat16 *Apanel,
+    const bfloat16 *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -85,31 +88,31 @@
       "movi v31.16b, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ldr q3, [%x[Apanel], #0x0]\n"
-      "ldr q6, [x22, #0x0]\n"
+      "ldr q6, [%x[Apanel], #0x0]\n"
+      "ldr q7, [x22, #0x0]\n"
       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
-      "ldr q7, [x22, #0x10]\n"
+      "ldr q3, [x22, #0x10]\n"
       ".inst 0x6e45ec0b  // bfmmla v11.4s, v0.8h, v5.8h\n"
       ".inst 0x6e44ec2e  // bfmmla v14.4s, v1.8h, v4.8h\n"
       ".inst 0x6e45ec31  // bfmmla v17.4s, v1.8h, v5.8h\n"
       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x6e45ec57  // bfmmla v23.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7a  // bfmmla v26.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e44ecda  // bfmmla v26.4s, v6.8h, v4.8h\n"
       "ldr q4, [x22, #0x20]\n"
-      ".inst 0x6e45ec7d  // bfmmla v29.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e45ecdd  // bfmmla v29.4s, v6.8h, v5.8h\n"
       "ldr q5, [x22, #0x30]\n"
-      ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0c  // bfmmla v12.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec2f  // bfmmla v15.4s, v1.8h, v6.8h\n"
+      ".inst 0x6e47ec09  // bfmmla v9.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0c  // bfmmla v12.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e47ec2f  // bfmmla v15.4s, v1.8h, v7.8h\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6e47ec32  // bfmmla v18.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec58  // bfmmla v24.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7b  // bfmmla v27.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x22, #0x40]\n"
-      ".inst 0x6e47ec7e  // bfmmla v30.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x22, #0x50]\n"
+      ".inst 0x6e43ec32  // bfmmla v18.4s, v1.8h, v3.8h\n"
+      ".inst 0x6e47ec55  // bfmmla v21.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec58  // bfmmla v24.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e47ecdb  // bfmmla v27.4s, v6.8h, v7.8h\n"
+      "ldr q7, [x22, #0x40]\n"
+      ".inst 0x6e43ecde  // bfmmla v30.4s, v6.8h, v3.8h\n"
+      "ldr q3, [x22, #0x50]\n"
       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0d  // bfmmla v13.4s, v0.8h, v5.8h\n"
       "ldr q0, [%x[Apanel], #0x10]\n"
@@ -119,42 +122,42 @@
       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
       ".inst 0x6e45ec59  // bfmmla v25.4s, v2.8h, v5.8h\n"
       "ldr q2, [%x[Apanel], #0x30]\n"
-      ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e44ecdc  // bfmmla v28.4s, v6.8h, v4.8h\n"
       "ldr q4, [x22, #0x60]\n"
-      ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
-      "ldr q3, [%x[Apanel], #0x40]\n"
+      ".inst 0x6e45ecdf  // bfmmla v31.4s, v6.8h, v5.8h\n"
+      "ldr q6, [%x[Apanel], #0x40]\n"
       "ldr q5, [x22, #0x70]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x22, #0x80]\n"
-      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x22, #0x90]\n"
+      ".inst 0x6e47ec08  // bfmmla v8.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0b  // bfmmla v11.4s, v0.8h, v3.8h\n"
+      ".inst 0x6e47ec2e  // bfmmla v14.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e43ec31  // bfmmla v17.4s, v1.8h, v3.8h\n"
+      ".inst 0x6e47ec54  // bfmmla v20.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec57  // bfmmla v23.4s, v2.8h, v3.8h\n"
+      ".inst 0x6e47ecda  // bfmmla v26.4s, v6.8h, v7.8h\n"
+      "ldr q7, [x22, #0x80]\n"
+      ".inst 0x6e43ecdd  // bfmmla v29.4s, v6.8h, v3.8h\n"
+      "ldr q3, [x22, #0x90]\n"
       ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
       ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
       ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
       ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
       ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
       ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
+      ".inst 0x6e44ecdb  // bfmmla v27.4s, v6.8h, v4.8h\n"
       "ldr q4, [x22, #0xa0]\n"
-      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
+      ".inst 0x6e45ecde  // bfmmla v30.4s, v6.8h, v5.8h\n"
       "ldr q5, [x22, #0xb0]\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e47ec0a  // bfmmla v10.4s, v0.8h, v7.8h\n"
+      ".inst 0x6e43ec0d  // bfmmla v13.4s, v0.8h, v3.8h\n"
       "ldr q0, [%x[Apanel], #0x50]\n"
-      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e47ec30  // bfmmla v16.4s, v1.8h, v7.8h\n"
+      ".inst 0x6e43ec33  // bfmmla v19.4s, v1.8h, v3.8h\n"
       "ldr q1, [%x[Apanel], #0x60]\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e47ec56  // bfmmla v22.4s, v2.8h, v7.8h\n"
+      ".inst 0x6e43ec59  // bfmmla v25.4s, v2.8h, v3.8h\n"
       "ldr q2, [%x[Apanel], #0x70]\n"
-      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e47ecdc  // bfmmla v28.4s, v6.8h, v7.8h\n"
+      ".inst 0x6e43ecdf  // bfmmla v31.4s, v6.8h, v3.8h\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "add x22, x22, #0xc0\n"
       "bge 3b\n"
@@ -191,89 +194,89 @@
       ".inst 0x6e44ec7c  // bfmmla v28.4s, v3.8h, v4.8h\n"
       ".inst 0x6e45ec7f  // bfmmla v31.4s, v3.8h, v5.8h\n"
       "cbz x20, 5f\n"
-      "ldr q6, [x22, #0x0]\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      ".inst 0x6e46ec08  // bfmmla v8.4s, v0.8h, v6.8h\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q7, [x22, #0x10]\n"
-      ".inst 0x6e47ec0b  // bfmmla v11.4s, v0.8h, v7.8h\n"
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
-      ".inst 0x6e46ec2e  // bfmmla v14.4s, v1.8h, v6.8h\n"
-      "ldr q4, [x22, #0x20]\n"
-      "ldr q5, [x22, #0x30]\n"
-      ".inst 0x6e47ec31  // bfmmla v17.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec54  // bfmmla v20.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec57  // bfmmla v23.4s, v2.8h, v7.8h\n"
+      "ldr q1, [x22, #0x0]\n"
+      "ldr q7, [%x[Apanel], #0x0]\n"
+      ".inst 0x6e41ece8  // bfmmla v8.4s, v7.8h, v1.8h\n"
+      "ldr q6, [%x[Apanel], #0x10]\n"
+      "ldr q0, [x22, #0x10]\n"
+      ".inst 0x6e40eceb  // bfmmla v11.4s, v7.8h, v0.8h\n"
+      "ldr q5, [%x[Apanel], #0x20]\n"
+      "ldr q4, [%x[Apanel], #0x30]\n"
+      ".inst 0x6e41ecce  // bfmmla v14.4s, v6.8h, v1.8h\n"
+      "ldr q3, [x22, #0x20]\n"
+      "ldr q2, [x22, #0x30]\n"
+      ".inst 0x6e40ecd1  // bfmmla v17.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e41ecb4  // bfmmla v20.4s, v5.8h, v1.8h\n"
+      ".inst 0x6e40ecb7  // bfmmla v23.4s, v5.8h, v0.8h\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x6e46ec7a  // bfmmla v26.4s, v3.8h, v6.8h\n"
-      "ldr q6, [x22, #0x40]\n"
-      ".inst 0x6e47ec7d  // bfmmla v29.4s, v3.8h, v7.8h\n"
-      "ldr q7, [x22, #0x50]\n"
-      ".inst 0x6e44ec09  // bfmmla v9.4s, v0.8h, v4.8h\n"
-      ".inst 0x6e45ec0c  // bfmmla v12.4s, v0.8h, v5.8h\n"
-      ".inst 0x6e44ec2f  // bfmmla v15.4s, v1.8h, v4.8h\n"
-      ".inst 0x6e45ec32  // bfmmla v18.4s, v1.8h, v5.8h\n"
+      ".inst 0x6e41ec9a  // bfmmla v26.4s, v4.8h, v1.8h\n"
+      "ldr q1, [x22, #0x40]\n"
+      ".inst 0x6e40ec9d  // bfmmla v29.4s, v4.8h, v0.8h\n"
+      "ldr q0, [x22, #0x50]\n"
+      ".inst 0x6e43ece9  // bfmmla v9.4s, v7.8h, v3.8h\n"
+      ".inst 0x6e42ecec  // bfmmla v12.4s, v7.8h, v2.8h\n"
+      ".inst 0x6e43eccf  // bfmmla v15.4s, v6.8h, v3.8h\n"
+      ".inst 0x6e42ecd2  // bfmmla v18.4s, v6.8h, v2.8h\n"
       "add x22, x22, #0x60\n"
-      ".inst 0x6e44ec55  // bfmmla v21.4s, v2.8h, v4.8h\n"
-      ".inst 0x6e45ec58  // bfmmla v24.4s, v2.8h, v5.8h\n"
-      ".inst 0x6e44ec7b  // bfmmla v27.4s, v3.8h, v4.8h\n"
-      ".inst 0x6e45ec7e  // bfmmla v30.4s, v3.8h, v5.8h\n"
-      ".inst 0x6e46ec0a  // bfmmla v10.4s, v0.8h, v6.8h\n"
-      ".inst 0x6e47ec0d  // bfmmla v13.4s, v0.8h, v7.8h\n"
-      ".inst 0x6e46ec30  // bfmmla v16.4s, v1.8h, v6.8h\n"
-      ".inst 0x6e47ec33  // bfmmla v19.4s, v1.8h, v7.8h\n"
-      ".inst 0x6e46ec56  // bfmmla v22.4s, v2.8h, v6.8h\n"
-      ".inst 0x6e47ec59  // bfmmla v25.4s, v2.8h, v7.8h\n"
-      ".inst 0x6e46ec7c  // bfmmla v28.4s, v3.8h, v6.8h\n"
-      ".inst 0x6e47ec7f  // bfmmla v31.4s, v3.8h, v7.8h\n"
+      ".inst 0x6e43ecb5  // bfmmla v21.4s, v5.8h, v3.8h\n"
+      ".inst 0x6e42ecb8  // bfmmla v24.4s, v5.8h, v2.8h\n"
+      ".inst 0x6e43ec9b  // bfmmla v27.4s, v4.8h, v3.8h\n"
+      ".inst 0x6e42ec9e  // bfmmla v30.4s, v4.8h, v2.8h\n"
+      ".inst 0x6e41ecea  // bfmmla v10.4s, v7.8h, v1.8h\n"
+      ".inst 0x6e40eced  // bfmmla v13.4s, v7.8h, v0.8h\n"
+      ".inst 0x6e41ecd0  // bfmmla v16.4s, v6.8h, v1.8h\n"
+      ".inst 0x6e40ecd3  // bfmmla v19.4s, v6.8h, v0.8h\n"
+      ".inst 0x6e41ecb6  // bfmmla v22.4s, v5.8h, v1.8h\n"
+      ".inst 0x6e40ecb9  // bfmmla v25.4s, v5.8h, v0.8h\n"
+      ".inst 0x6e41ec9c  // bfmmla v28.4s, v4.8h, v1.8h\n"
+      ".inst 0x6e40ec9f  // bfmmla v31.4s, v4.8h, v0.8h\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 4cc3ed0..37a54fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -56,11 +56,6 @@
         return 12;
     }
 
-    static unsigned int stripe_width()
-    {
-        return 4;
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 8;
@@ -111,11 +106,9 @@
                 break;
         }
     }
-
 };
 
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
index e46cb8a..c1d3738 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_s8s32_mmla_8x12_a510(
-    const int8_t *Apanel, const int8_t *Bpanel,
-    int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *Apanel,
+    const int8_t *Bpanel,
+    int32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -82,28 +85,28 @@
       "movi v31.4s, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
       ".inst 0x4e84a408  // smmla v8.4s, v0.16b, v4.16b\n"
       ".inst 0x4e85a40b  // smmla v11.4s, v0.16b, v5.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      "ldp q3, q7, [x22], #0x20\n"
       ".inst 0x4e84a42e  // smmla v14.4s, v1.16b, v4.16b\n"
       ".inst 0x4e85a431  // smmla v17.4s, v1.16b, v5.16b\n"
       ".inst 0x4e84a454  // smmla v20.4s, v2.16b, v4.16b\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x4e85a457  // smmla v23.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47a  // smmla v26.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e84a4da  // smmla v26.4s, v6.16b, v4.16b\n"
       "cmp x20, #0x2\n"
-      ".inst 0x4e85a47d  // smmla v29.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e85a4dd  // smmla v29.4s, v6.16b, v5.16b\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e83a409  // smmla v9.4s, v0.16b, v3.16b\n"
       ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e83a42f  // smmla v15.4s, v1.16b, v3.16b\n"
       ".inst 0x4e87a432  // smmla v18.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x4e83a455  // smmla v21.4s, v2.16b, v3.16b\n"
       ".inst 0x4e87a458  // smmla v24.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47b  // smmla v27.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47e  // smmla v30.4s, v3.16b, v7.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      ".inst 0x4e83a4db  // smmla v27.4s, v6.16b, v3.16b\n"
+      ".inst 0x4e87a4de  // smmla v30.4s, v6.16b, v7.16b\n"
+      "ldp q7, q3, [x22], #0x20\n"
       ".inst 0x4e84a40a  // smmla v10.4s, v0.16b, v4.16b\n"
       ".inst 0x4e85a40d  // smmla v13.4s, v0.16b, v5.16b\n"
       "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
@@ -113,39 +116,39 @@
       ".inst 0x4e84a456  // smmla v22.4s, v2.16b, v4.16b\n"
       ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
       "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e84a47c  // smmla v28.4s, v3.16b, v4.16b\n"
-      ".inst 0x4e85a47f  // smmla v31.4s, v3.16b, v5.16b\n"
-      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
+      ".inst 0x4e84a4dc  // smmla v28.4s, v6.16b, v4.16b\n"
+      ".inst 0x4e85a4df  // smmla v31.4s, v6.16b, v5.16b\n"
+      "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a431  // smmla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47a  // smmla v26.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47d  // smmla v29.4s, v3.16b, v7.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      ".inst 0x4e83a40b  // smmla v11.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e87a42e  // smmla v14.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e83a431  // smmla v17.4s, v1.16b, v3.16b\n"
+      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e83a457  // smmla v23.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e87a4da  // smmla v26.4s, v6.16b, v7.16b\n"
+      ".inst 0x4e83a4dd  // smmla v29.4s, v6.16b, v3.16b\n"
+      "ldp q7, q3, [x22], #0x20\n"
       ".inst 0x4e84a409  // smmla v9.4s, v0.16b, v4.16b\n"
       ".inst 0x4e85a40c  // smmla v12.4s, v0.16b, v5.16b\n"
       ".inst 0x4e84a42f  // smmla v15.4s, v1.16b, v4.16b\n"
       ".inst 0x4e85a432  // smmla v18.4s, v1.16b, v5.16b\n"
       ".inst 0x4e84a455  // smmla v21.4s, v2.16b, v4.16b\n"
       ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47b  // smmla v27.4s, v3.16b, v4.16b\n"
-      ".inst 0x4e85a47e  // smmla v30.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e84a4db  // smmla v27.4s, v6.16b, v4.16b\n"
+      ".inst 0x4e85a4de  // smmla v30.4s, v6.16b, v5.16b\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e83a40d  // smmla v13.4s, v0.16b, v3.16b\n"
       "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a433  // smmla v19.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e87a430  // smmla v16.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e83a433  // smmla v19.4s, v1.16b, v3.16b\n"
       "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e83a459  // smmla v25.4s, v2.16b, v3.16b\n"
       "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e86a47c  // smmla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47f  // smmla v31.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e87a4dc  // smmla v28.4s, v6.16b, v7.16b\n"
+      ".inst 0x4e83a4df  // smmla v31.4s, v6.16b, v3.16b\n"
       "bge 3b\n"
       "4:"  // main loop skip
       "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
@@ -158,7 +161,7 @@
       ".inst 0x4e85a457  // smmla v23.4s, v2.16b, v5.16b\n"
       ".inst 0x4e84a47a  // smmla v26.4s, v3.16b, v4.16b\n"
       ".inst 0x4e85a47d  // smmla v29.4s, v3.16b, v5.16b\n"
-      "ldp q4, q5, [x22], #0x20\n"
+      "ldp q5, q4, [x22], #0x20\n"
       ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
       ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
       ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
@@ -167,93 +170,93 @@
       ".inst 0x4e87a458  // smmla v24.4s, v2.16b, v7.16b\n"
       ".inst 0x4e86a47b  // smmla v27.4s, v3.16b, v6.16b\n"
       ".inst 0x4e87a47e  // smmla v30.4s, v3.16b, v7.16b\n"
-      ".inst 0x4e84a40a  // smmla v10.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e85a40d  // smmla v13.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e84a430  // smmla v16.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a433  // smmla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x4e84a456  // smmla v22.4s, v2.16b, v4.16b\n"
-      ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47c  // smmla v28.4s, v3.16b, v4.16b\n"
-      ".inst 0x4e85a47f  // smmla v31.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e85a40a  // smmla v10.4s, v0.16b, v5.16b\n"
+      ".inst 0x4e84a40d  // smmla v13.4s, v0.16b, v4.16b\n"
+      ".inst 0x4e85a430  // smmla v16.4s, v1.16b, v5.16b\n"
+      ".inst 0x4e84a433  // smmla v19.4s, v1.16b, v4.16b\n"
+      ".inst 0x4e85a456  // smmla v22.4s, v2.16b, v5.16b\n"
+      ".inst 0x4e84a459  // smmla v25.4s, v2.16b, v4.16b\n"
+      ".inst 0x4e85a47c  // smmla v28.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e84a47f  // smmla v31.4s, v3.16b, v4.16b\n"
       "cbz x20, 5f\n"
-      "ldp q6, q7, [x22], #0x20\n"
-      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
-      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
-      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a431  // smmla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47a  // smmla v26.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47d  // smmla v29.4s, v3.16b, v7.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
-      ".inst 0x4e84a409  // smmla v9.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e85a40c  // smmla v12.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e84a42f  // smmla v15.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a432  // smmla v18.4s, v1.16b, v5.16b\n"
-      ".inst 0x4e84a455  // smmla v21.4s, v2.16b, v4.16b\n"
-      ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47b  // smmla v27.4s, v3.16b, v4.16b\n"
-      ".inst 0x4e85a47e  // smmla v30.4s, v3.16b, v5.16b\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a433  // smmla v19.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47c  // smmla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47f  // smmla v31.4s, v3.16b, v7.16b\n"
+      "ldp q1, q0, [x22], #0x20\n"
+      "ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e81a4e8  // smmla v8.4s, v7.16b, v1.16b\n"
+      "ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x4e80a4eb  // smmla v11.4s, v7.16b, v0.16b\n"
+      "ldp q3, q2, [x22], #0x20\n"
+      ".inst 0x4e81a4ce  // smmla v14.4s, v6.16b, v1.16b\n"
+      ".inst 0x4e80a4d1  // smmla v17.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e81a4b4  // smmla v20.4s, v5.16b, v1.16b\n"
+      ".inst 0x4e80a4b7  // smmla v23.4s, v5.16b, v0.16b\n"
+      ".inst 0x4e81a49a  // smmla v26.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e80a49d  // smmla v29.4s, v4.16b, v0.16b\n"
+      "ldp q1, q0, [x22], #0x20\n"
+      ".inst 0x4e83a4e9  // smmla v9.4s, v7.16b, v3.16b\n"
+      ".inst 0x4e82a4ec  // smmla v12.4s, v7.16b, v2.16b\n"
+      ".inst 0x4e83a4cf  // smmla v15.4s, v6.16b, v3.16b\n"
+      ".inst 0x4e82a4d2  // smmla v18.4s, v6.16b, v2.16b\n"
+      ".inst 0x4e83a4b5  // smmla v21.4s, v5.16b, v3.16b\n"
+      ".inst 0x4e82a4b8  // smmla v24.4s, v5.16b, v2.16b\n"
+      ".inst 0x4e83a49b  // smmla v27.4s, v4.16b, v3.16b\n"
+      ".inst 0x4e82a49e  // smmla v30.4s, v4.16b, v2.16b\n"
+      ".inst 0x4e81a4ea  // smmla v10.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e80a4ed  // smmla v13.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e81a4d0  // smmla v16.4s, v6.16b, v1.16b\n"
+      ".inst 0x4e80a4d3  // smmla v19.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e81a4b6  // smmla v22.4s, v5.16b, v1.16b\n"
+      ".inst 0x4e80a4b9  // smmla v25.4s, v5.16b, v0.16b\n"
+      ".inst 0x4e81a49c  // smmla v28.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e80a49f  // smmla v31.4s, v4.16b, v0.16b\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index fc20c2f..a097dc3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_s8s32_mmla_8x12(
-    const int8_t *Apanel, const int8_t *Bpanel,
-    int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *Apanel,
+    const int8_t *Bpanel,
+    int32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -85,31 +88,31 @@
       "movi v31.4s, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ldr q3, [%x[Apanel], #0x0]\n"
-      "ldr q6, [x22, #0x0]\n"
+      "ldr q6, [%x[Apanel], #0x0]\n"
+      "ldr q7, [x22, #0x0]\n"
       ".inst 0x4e84a408  // smmla v8.4s, v0.16b, v4.16b\n"
-      "ldr q7, [x22, #0x10]\n"
+      "ldr q3, [x22, #0x10]\n"
       ".inst 0x4e85a40b  // smmla v11.4s, v0.16b, v5.16b\n"
       ".inst 0x4e84a42e  // smmla v14.4s, v1.16b, v4.16b\n"
       ".inst 0x4e85a431  // smmla v17.4s, v1.16b, v5.16b\n"
       ".inst 0x4e84a454  // smmla v20.4s, v2.16b, v4.16b\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x4e85a457  // smmla v23.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47a  // smmla v26.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e84a4da  // smmla v26.4s, v6.16b, v4.16b\n"
       "ldr q4, [x22, #0x20]\n"
-      ".inst 0x4e85a47d  // smmla v29.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e85a4dd  // smmla v29.4s, v6.16b, v5.16b\n"
       "ldr q5, [x22, #0x30]\n"
-      ".inst 0x4e86a409  // smmla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40c  // smmla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a42f  // smmla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x4e87a409  // smmla v9.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e83a40c  // smmla v12.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e87a42f  // smmla v15.4s, v1.16b, v7.16b\n"
       "cmp x20, #0x2\n"
-      ".inst 0x4e87a432  // smmla v18.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a455  // smmla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a458  // smmla v24.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47b  // smmla v27.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x22, #0x40]\n"
-      ".inst 0x4e87a47e  // smmla v30.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x22, #0x50]\n"
+      ".inst 0x4e83a432  // smmla v18.4s, v1.16b, v3.16b\n"
+      ".inst 0x4e87a455  // smmla v21.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e83a458  // smmla v24.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e87a4db  // smmla v27.4s, v6.16b, v7.16b\n"
+      "ldr q7, [x22, #0x40]\n"
+      ".inst 0x4e83a4de  // smmla v30.4s, v6.16b, v3.16b\n"
+      "ldr q3, [x22, #0x50]\n"
       ".inst 0x4e84a40a  // smmla v10.4s, v0.16b, v4.16b\n"
       ".inst 0x4e85a40d  // smmla v13.4s, v0.16b, v5.16b\n"
       "ldr q0, [%x[Apanel], #0x10]\n"
@@ -119,42 +122,42 @@
       ".inst 0x4e84a456  // smmla v22.4s, v2.16b, v4.16b\n"
       ".inst 0x4e85a459  // smmla v25.4s, v2.16b, v5.16b\n"
       "ldr q2, [%x[Apanel], #0x30]\n"
-      ".inst 0x4e84a47c  // smmla v28.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e84a4dc  // smmla v28.4s, v6.16b, v4.16b\n"
       "ldr q4, [x22, #0x60]\n"
-      ".inst 0x4e85a47f  // smmla v31.4s, v3.16b, v5.16b\n"
-      "ldr q3, [%x[Apanel], #0x40]\n"
+      ".inst 0x4e85a4df  // smmla v31.4s, v6.16b, v5.16b\n"
+      "ldr q6, [%x[Apanel], #0x40]\n"
       "ldr q5, [x22, #0x70]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a431  // smmla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47a  // smmla v26.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x22, #0x80]\n"
-      ".inst 0x4e87a47d  // smmla v29.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x22, #0x90]\n"
+      ".inst 0x4e87a408  // smmla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e83a40b  // smmla v11.4s, v0.16b, v3.16b\n"
+      ".inst 0x4e87a42e  // smmla v14.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e83a431  // smmla v17.4s, v1.16b, v3.16b\n"
+      ".inst 0x4e87a454  // smmla v20.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e83a457  // smmla v23.4s, v2.16b, v3.16b\n"
+      ".inst 0x4e87a4da  // smmla v26.4s, v6.16b, v7.16b\n"
+      "ldr q7, [x22, #0x80]\n"
+      ".inst 0x4e83a4dd  // smmla v29.4s, v6.16b, v3.16b\n"
+      "ldr q3, [x22, #0x90]\n"
       ".inst 0x4e84a409  // smmla v9.4s, v0.16b, v4.16b\n"
       ".inst 0x4e85a40c  // smmla v12.4s, v0.16b, v5.16b\n"
       ".inst 0x4e84a42f  // smmla v15.4s, v1.16b, v4.16b\n"
       ".inst 0x4e85a432  // smmla v18.4s, v1.16b, v5.16b\n"
       ".inst 0x4e84a455  // smmla v21.4s, v2.16b, v4.16b\n"
       ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47b  // smmla v27.4s, v3.16b, v4.16b\n"
+      ".inst 0x4e84a4db  // smmla v27.4s, v6.16b, v4.16b\n"
       "ldr q4, [x22, #0xa0]\n"
-      ".inst 0x4e85a47e  // smmla v30.4s, v3.16b, v5.16b\n"
+      ".inst 0x4e85a4de  // smmla v30.4s, v6.16b, v5.16b\n"
       "ldr q5, [x22, #0xb0]\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e87a40a  // smmla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x4e83a40d  // smmla v13.4s, v0.16b, v3.16b\n"
       "ldr q0, [%x[Apanel], #0x50]\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a433  // smmla v19.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e87a430  // smmla v16.4s, v1.16b, v7.16b\n"
+      ".inst 0x4e83a433  // smmla v19.4s, v1.16b, v3.16b\n"
       "ldr q1, [%x[Apanel], #0x60]\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e87a456  // smmla v22.4s, v2.16b, v7.16b\n"
+      ".inst 0x4e83a459  // smmla v25.4s, v2.16b, v3.16b\n"
       "ldr q2, [%x[Apanel], #0x70]\n"
-      ".inst 0x4e86a47c  // smmla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47f  // smmla v31.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e87a4dc  // smmla v28.4s, v6.16b, v7.16b\n"
+      ".inst 0x4e83a4df  // smmla v31.4s, v6.16b, v3.16b\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "add x22, x22, #0xc0\n"
       "bge 3b\n"
@@ -191,89 +194,89 @@
       ".inst 0x4e84a47c  // smmla v28.4s, v3.16b, v4.16b\n"
       ".inst 0x4e85a47f  // smmla v31.4s, v3.16b, v5.16b\n"
       "cbz x20, 5f\n"
-      "ldr q6, [x22, #0x0]\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      ".inst 0x4e86a408  // smmla v8.4s, v0.16b, v6.16b\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q7, [x22, #0x10]\n"
-      ".inst 0x4e87a40b  // smmla v11.4s, v0.16b, v7.16b\n"
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
-      ".inst 0x4e86a42e  // smmla v14.4s, v1.16b, v6.16b\n"
-      "ldr q4, [x22, #0x20]\n"
-      "ldr q5, [x22, #0x30]\n"
-      ".inst 0x4e87a431  // smmla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a454  // smmla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a457  // smmla v23.4s, v2.16b, v7.16b\n"
+      "ldr q1, [x22, #0x0]\n"
+      "ldr q7, [%x[Apanel], #0x0]\n"
+      ".inst 0x4e81a4e8  // smmla v8.4s, v7.16b, v1.16b\n"
+      "ldr q6, [%x[Apanel], #0x10]\n"
+      "ldr q0, [x22, #0x10]\n"
+      ".inst 0x4e80a4eb  // smmla v11.4s, v7.16b, v0.16b\n"
+      "ldr q5, [%x[Apanel], #0x20]\n"
+      "ldr q4, [%x[Apanel], #0x30]\n"
+      ".inst 0x4e81a4ce  // smmla v14.4s, v6.16b, v1.16b\n"
+      "ldr q3, [x22, #0x20]\n"
+      "ldr q2, [x22, #0x30]\n"
+      ".inst 0x4e80a4d1  // smmla v17.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e81a4b4  // smmla v20.4s, v5.16b, v1.16b\n"
+      ".inst 0x4e80a4b7  // smmla v23.4s, v5.16b, v0.16b\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x4e86a47a  // smmla v26.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x22, #0x40]\n"
-      ".inst 0x4e87a47d  // smmla v29.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x22, #0x50]\n"
-      ".inst 0x4e84a409  // smmla v9.4s, v0.16b, v4.16b\n"
-      ".inst 0x4e85a40c  // smmla v12.4s, v0.16b, v5.16b\n"
-      ".inst 0x4e84a42f  // smmla v15.4s, v1.16b, v4.16b\n"
-      ".inst 0x4e85a432  // smmla v18.4s, v1.16b, v5.16b\n"
+      ".inst 0x4e81a49a  // smmla v26.4s, v4.16b, v1.16b\n"
+      "ldr q1, [x22, #0x40]\n"
+      ".inst 0x4e80a49d  // smmla v29.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x22, #0x50]\n"
+      ".inst 0x4e83a4e9  // smmla v9.4s, v7.16b, v3.16b\n"
+      ".inst 0x4e82a4ec  // smmla v12.4s, v7.16b, v2.16b\n"
+      ".inst 0x4e83a4cf  // smmla v15.4s, v6.16b, v3.16b\n"
+      ".inst 0x4e82a4d2  // smmla v18.4s, v6.16b, v2.16b\n"
       "add x22, x22, #0x60\n"
-      ".inst 0x4e84a455  // smmla v21.4s, v2.16b, v4.16b\n"
-      ".inst 0x4e85a458  // smmla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x4e84a47b  // smmla v27.4s, v3.16b, v4.16b\n"
-      ".inst 0x4e85a47e  // smmla v30.4s, v3.16b, v5.16b\n"
-      ".inst 0x4e86a40a  // smmla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x4e87a40d  // smmla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x4e86a430  // smmla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x4e87a433  // smmla v19.4s, v1.16b, v7.16b\n"
-      ".inst 0x4e86a456  // smmla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x4e87a459  // smmla v25.4s, v2.16b, v7.16b\n"
-      ".inst 0x4e86a47c  // smmla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x4e87a47f  // smmla v31.4s, v3.16b, v7.16b\n"
+      ".inst 0x4e83a4b5  // smmla v21.4s, v5.16b, v3.16b\n"
+      ".inst 0x4e82a4b8  // smmla v24.4s, v5.16b, v2.16b\n"
+      ".inst 0x4e83a49b  // smmla v27.4s, v4.16b, v3.16b\n"
+      ".inst 0x4e82a49e  // smmla v30.4s, v4.16b, v2.16b\n"
+      ".inst 0x4e81a4ea  // smmla v10.4s, v7.16b, v1.16b\n"
+      ".inst 0x4e80a4ed  // smmla v13.4s, v7.16b, v0.16b\n"
+      ".inst 0x4e81a4d0  // smmla v16.4s, v6.16b, v1.16b\n"
+      ".inst 0x4e80a4d3  // smmla v19.4s, v6.16b, v0.16b\n"
+      ".inst 0x4e81a4b6  // smmla v22.4s, v5.16b, v1.16b\n"
+      ".inst 0x4e80a4b9  // smmla v25.4s, v5.16b, v0.16b\n"
+      ".inst 0x4e81a49c  // smmla v28.4s, v4.16b, v1.16b\n"
+      ".inst 0x4e80a49f  // smmla v31.4s, v4.16b, v0.16b\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index fa93c1d..0088557 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
 
@@ -56,11 +56,6 @@
         return 12;
     }
 
-    static unsigned int stripe_width()
-    {
-        return 4;
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 8;
@@ -116,5 +111,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
index 83301d8..54c5195 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_u8u32_mmla_8x12_a510(
-    const uint8_t *Apanel, const uint8_t *Bpanel,
-    uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *Apanel,
+    const uint8_t *Bpanel,
+    uint32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -82,28 +85,28 @@
       "movi v31.4s, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
       ".inst 0x6e84a408  // ummla v8.4s, v0.16b, v4.16b\n"
       ".inst 0x6e85a40b  // ummla v11.4s, v0.16b, v5.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      "ldp q3, q7, [x22], #0x20\n"
       ".inst 0x6e84a42e  // ummla v14.4s, v1.16b, v4.16b\n"
       ".inst 0x6e85a431  // ummla v17.4s, v1.16b, v5.16b\n"
       ".inst 0x6e84a454  // ummla v20.4s, v2.16b, v4.16b\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x6e85a457  // ummla v23.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47a  // ummla v26.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e84a4da  // ummla v26.4s, v6.16b, v4.16b\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6e85a47d  // ummla v29.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e85a4dd  // ummla v29.4s, v6.16b, v5.16b\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e83a409  // ummla v9.4s, v0.16b, v3.16b\n"
       ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e83a42f  // ummla v15.4s, v1.16b, v3.16b\n"
       ".inst 0x6e87a432  // ummla v18.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
+      ".inst 0x6e83a455  // ummla v21.4s, v2.16b, v3.16b\n"
       ".inst 0x6e87a458  // ummla v24.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47b  // ummla v27.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47e  // ummla v30.4s, v3.16b, v7.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      ".inst 0x6e83a4db  // ummla v27.4s, v6.16b, v3.16b\n"
+      ".inst 0x6e87a4de  // ummla v30.4s, v6.16b, v7.16b\n"
+      "ldp q7, q3, [x22], #0x20\n"
       ".inst 0x6e84a40a  // ummla v10.4s, v0.16b, v4.16b\n"
       ".inst 0x6e85a40d  // ummla v13.4s, v0.16b, v5.16b\n"
       "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
@@ -113,39 +116,39 @@
       ".inst 0x6e84a456  // ummla v22.4s, v2.16b, v4.16b\n"
       ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
       "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e84a47c  // ummla v28.4s, v3.16b, v4.16b\n"
-      ".inst 0x6e85a47f  // ummla v31.4s, v3.16b, v5.16b\n"
-      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
+      ".inst 0x6e84a4dc  // ummla v28.4s, v6.16b, v4.16b\n"
+      ".inst 0x6e85a4df  // ummla v31.4s, v6.16b, v5.16b\n"
+      "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a431  // ummla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47a  // ummla v26.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47d  // ummla v29.4s, v3.16b, v7.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
+      ".inst 0x6e83a40b  // ummla v11.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e87a42e  // ummla v14.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e83a431  // ummla v17.4s, v1.16b, v3.16b\n"
+      ".inst 0x6e87a454  // ummla v20.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e83a457  // ummla v23.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e87a4da  // ummla v26.4s, v6.16b, v7.16b\n"
+      ".inst 0x6e83a4dd  // ummla v29.4s, v6.16b, v3.16b\n"
+      "ldp q7, q3, [x22], #0x20\n"
       ".inst 0x6e84a409  // ummla v9.4s, v0.16b, v4.16b\n"
       ".inst 0x6e85a40c  // ummla v12.4s, v0.16b, v5.16b\n"
       ".inst 0x6e84a42f  // ummla v15.4s, v1.16b, v4.16b\n"
       ".inst 0x6e85a432  // ummla v18.4s, v1.16b, v5.16b\n"
       ".inst 0x6e84a455  // ummla v21.4s, v2.16b, v4.16b\n"
       ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47b  // ummla v27.4s, v3.16b, v4.16b\n"
-      ".inst 0x6e85a47e  // ummla v30.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e84a4db  // ummla v27.4s, v6.16b, v4.16b\n"
+      ".inst 0x6e85a4de  // ummla v30.4s, v6.16b, v5.16b\n"
       "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e83a40d  // ummla v13.4s, v0.16b, v3.16b\n"
       "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a433  // ummla v19.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e87a430  // ummla v16.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e83a433  // ummla v19.4s, v1.16b, v3.16b\n"
       "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e87a456  // ummla v22.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e83a459  // ummla v25.4s, v2.16b, v3.16b\n"
       "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e86a47c  // ummla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47f  // ummla v31.4s, v3.16b, v7.16b\n"
+      ".inst 0x6e87a4dc  // ummla v28.4s, v6.16b, v7.16b\n"
+      ".inst 0x6e83a4df  // ummla v31.4s, v6.16b, v3.16b\n"
       "bge 3b\n"
       "4:"  // main loop skip
       "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
@@ -158,7 +161,7 @@
       ".inst 0x6e85a457  // ummla v23.4s, v2.16b, v5.16b\n"
       ".inst 0x6e84a47a  // ummla v26.4s, v3.16b, v4.16b\n"
       ".inst 0x6e85a47d  // ummla v29.4s, v3.16b, v5.16b\n"
-      "ldp q4, q5, [x22], #0x20\n"
+      "ldp q5, q4, [x22], #0x20\n"
       ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
       ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
       ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
@@ -167,93 +170,93 @@
       ".inst 0x6e87a458  // ummla v24.4s, v2.16b, v7.16b\n"
       ".inst 0x6e86a47b  // ummla v27.4s, v3.16b, v6.16b\n"
       ".inst 0x6e87a47e  // ummla v30.4s, v3.16b, v7.16b\n"
-      ".inst 0x6e84a40a  // ummla v10.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e85a40d  // ummla v13.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e84a430  // ummla v16.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a433  // ummla v19.4s, v1.16b, v5.16b\n"
-      ".inst 0x6e84a456  // ummla v22.4s, v2.16b, v4.16b\n"
-      ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47c  // ummla v28.4s, v3.16b, v4.16b\n"
-      ".inst 0x6e85a47f  // ummla v31.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e85a40a  // ummla v10.4s, v0.16b, v5.16b\n"
+      ".inst 0x6e84a40d  // ummla v13.4s, v0.16b, v4.16b\n"
+      ".inst 0x6e85a430  // ummla v16.4s, v1.16b, v5.16b\n"
+      ".inst 0x6e84a433  // ummla v19.4s, v1.16b, v4.16b\n"
+      ".inst 0x6e85a456  // ummla v22.4s, v2.16b, v5.16b\n"
+      ".inst 0x6e84a459  // ummla v25.4s, v2.16b, v4.16b\n"
+      ".inst 0x6e85a47c  // ummla v28.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e84a47f  // ummla v31.4s, v3.16b, v4.16b\n"
       "cbz x20, 5f\n"
-      "ldp q6, q7, [x22], #0x20\n"
-      "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
-      "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
-      "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      "ldp q4, q5, [x22], #0x20\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a431  // ummla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47a  // ummla v26.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47d  // ummla v29.4s, v3.16b, v7.16b\n"
-      "ldp q6, q7, [x22], #0x20\n"
-      ".inst 0x6e84a409  // ummla v9.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e85a40c  // ummla v12.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e84a42f  // ummla v15.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a432  // ummla v18.4s, v1.16b, v5.16b\n"
-      ".inst 0x6e84a455  // ummla v21.4s, v2.16b, v4.16b\n"
-      ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47b  // ummla v27.4s, v3.16b, v4.16b\n"
-      ".inst 0x6e85a47e  // ummla v30.4s, v3.16b, v5.16b\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a433  // ummla v19.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47c  // ummla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47f  // ummla v31.4s, v3.16b, v7.16b\n"
+      "ldp q1, q0, [x22], #0x20\n"
+      "ld1 { v7.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v6.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e81a4e8  // ummla v8.4s, v7.16b, v1.16b\n"
+      "ld1 { v5.16b }, [%x[Apanel]], #0x10\n"
+      "ld1 { v4.16b }, [%x[Apanel]], #0x10\n"
+      ".inst 0x6e80a4eb  // ummla v11.4s, v7.16b, v0.16b\n"
+      "ldp q3, q2, [x22], #0x20\n"
+      ".inst 0x6e81a4ce  // ummla v14.4s, v6.16b, v1.16b\n"
+      ".inst 0x6e80a4d1  // ummla v17.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e81a4b4  // ummla v20.4s, v5.16b, v1.16b\n"
+      ".inst 0x6e80a4b7  // ummla v23.4s, v5.16b, v0.16b\n"
+      ".inst 0x6e81a49a  // ummla v26.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e80a49d  // ummla v29.4s, v4.16b, v0.16b\n"
+      "ldp q1, q0, [x22], #0x20\n"
+      ".inst 0x6e83a4e9  // ummla v9.4s, v7.16b, v3.16b\n"
+      ".inst 0x6e82a4ec  // ummla v12.4s, v7.16b, v2.16b\n"
+      ".inst 0x6e83a4cf  // ummla v15.4s, v6.16b, v3.16b\n"
+      ".inst 0x6e82a4d2  // ummla v18.4s, v6.16b, v2.16b\n"
+      ".inst 0x6e83a4b5  // ummla v21.4s, v5.16b, v3.16b\n"
+      ".inst 0x6e82a4b8  // ummla v24.4s, v5.16b, v2.16b\n"
+      ".inst 0x6e83a49b  // ummla v27.4s, v4.16b, v3.16b\n"
+      ".inst 0x6e82a49e  // ummla v30.4s, v4.16b, v2.16b\n"
+      ".inst 0x6e81a4ea  // ummla v10.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e80a4ed  // ummla v13.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e81a4d0  // ummla v16.4s, v6.16b, v1.16b\n"
+      ".inst 0x6e80a4d3  // ummla v19.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e81a4b6  // ummla v22.4s, v5.16b, v1.16b\n"
+      ".inst 0x6e80a4b9  // ummla v25.4s, v5.16b, v0.16b\n"
+      ".inst 0x6e81a49c  // ummla v28.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e80a49f  // ummla v31.4s, v4.16b, v0.16b\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index c534219..30260b9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void a64_interleaved_u8u32_mmla_8x12(
-    const uint8_t *Apanel, const uint8_t *Bpanel,
-    uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *Apanel,
+    const uint8_t *Bpanel,
+    uint32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -43,7 +47,6 @@
     ka.bblocks = bblocks;
 
     __asm__ __volatile__(
-
       "1:"  // Height loop
       "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n"
       "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
@@ -85,31 +88,31 @@
       "movi v31.4s, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ldr q3, [%x[Apanel], #0x0]\n"
-      "ldr q6, [x22, #0x0]\n"
+      "ldr q6, [%x[Apanel], #0x0]\n"
+      "ldr q7, [x22, #0x0]\n"
       ".inst 0x6e84a408  // ummla v8.4s, v0.16b, v4.16b\n"
-      "ldr q7, [x22, #0x10]\n"
+      "ldr q3, [x22, #0x10]\n"
       ".inst 0x6e85a40b  // ummla v11.4s, v0.16b, v5.16b\n"
       ".inst 0x6e84a42e  // ummla v14.4s, v1.16b, v4.16b\n"
       ".inst 0x6e85a431  // ummla v17.4s, v1.16b, v5.16b\n"
       ".inst 0x6e84a454  // ummla v20.4s, v2.16b, v4.16b\n"
       "sub x20, x20, #0x2\n"
       ".inst 0x6e85a457  // ummla v23.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47a  // ummla v26.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e84a4da  // ummla v26.4s, v6.16b, v4.16b\n"
       "ldr q4, [x22, #0x20]\n"
-      ".inst 0x6e85a47d  // ummla v29.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e85a4dd  // ummla v29.4s, v6.16b, v5.16b\n"
       "ldr q5, [x22, #0x30]\n"
-      ".inst 0x6e86a409  // ummla v9.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40c  // ummla v12.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a42f  // ummla v15.4s, v1.16b, v6.16b\n"
+      ".inst 0x6e87a409  // ummla v9.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e83a40c  // ummla v12.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e87a42f  // ummla v15.4s, v1.16b, v7.16b\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6e87a432  // ummla v18.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a455  // ummla v21.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a458  // ummla v24.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47b  // ummla v27.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x22, #0x40]\n"
-      ".inst 0x6e87a47e  // ummla v30.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x22, #0x50]\n"
+      ".inst 0x6e83a432  // ummla v18.4s, v1.16b, v3.16b\n"
+      ".inst 0x6e87a455  // ummla v21.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e83a458  // ummla v24.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e87a4db  // ummla v27.4s, v6.16b, v7.16b\n"
+      "ldr q7, [x22, #0x40]\n"
+      ".inst 0x6e83a4de  // ummla v30.4s, v6.16b, v3.16b\n"
+      "ldr q3, [x22, #0x50]\n"
       ".inst 0x6e84a40a  // ummla v10.4s, v0.16b, v4.16b\n"
       ".inst 0x6e85a40d  // ummla v13.4s, v0.16b, v5.16b\n"
       "ldr q0, [%x[Apanel], #0x10]\n"
@@ -119,42 +122,42 @@
       ".inst 0x6e84a456  // ummla v22.4s, v2.16b, v4.16b\n"
       ".inst 0x6e85a459  // ummla v25.4s, v2.16b, v5.16b\n"
       "ldr q2, [%x[Apanel], #0x30]\n"
-      ".inst 0x6e84a47c  // ummla v28.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e84a4dc  // ummla v28.4s, v6.16b, v4.16b\n"
       "ldr q4, [x22, #0x60]\n"
-      ".inst 0x6e85a47f  // ummla v31.4s, v3.16b, v5.16b\n"
-      "ldr q3, [%x[Apanel], #0x40]\n"
+      ".inst 0x6e85a4df  // ummla v31.4s, v6.16b, v5.16b\n"
+      "ldr q6, [%x[Apanel], #0x40]\n"
       "ldr q5, [x22, #0x70]\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a431  // ummla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47a  // ummla v26.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x22, #0x80]\n"
-      ".inst 0x6e87a47d  // ummla v29.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x22, #0x90]\n"
+      ".inst 0x6e87a408  // ummla v8.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e83a40b  // ummla v11.4s, v0.16b, v3.16b\n"
+      ".inst 0x6e87a42e  // ummla v14.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e83a431  // ummla v17.4s, v1.16b, v3.16b\n"
+      ".inst 0x6e87a454  // ummla v20.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e83a457  // ummla v23.4s, v2.16b, v3.16b\n"
+      ".inst 0x6e87a4da  // ummla v26.4s, v6.16b, v7.16b\n"
+      "ldr q7, [x22, #0x80]\n"
+      ".inst 0x6e83a4dd  // ummla v29.4s, v6.16b, v3.16b\n"
+      "ldr q3, [x22, #0x90]\n"
       ".inst 0x6e84a409  // ummla v9.4s, v0.16b, v4.16b\n"
       ".inst 0x6e85a40c  // ummla v12.4s, v0.16b, v5.16b\n"
       ".inst 0x6e84a42f  // ummla v15.4s, v1.16b, v4.16b\n"
       ".inst 0x6e85a432  // ummla v18.4s, v1.16b, v5.16b\n"
       ".inst 0x6e84a455  // ummla v21.4s, v2.16b, v4.16b\n"
       ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47b  // ummla v27.4s, v3.16b, v4.16b\n"
+      ".inst 0x6e84a4db  // ummla v27.4s, v6.16b, v4.16b\n"
       "ldr q4, [x22, #0xa0]\n"
-      ".inst 0x6e85a47e  // ummla v30.4s, v3.16b, v5.16b\n"
+      ".inst 0x6e85a4de  // ummla v30.4s, v6.16b, v5.16b\n"
       "ldr q5, [x22, #0xb0]\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e87a40a  // ummla v10.4s, v0.16b, v7.16b\n"
+      ".inst 0x6e83a40d  // ummla v13.4s, v0.16b, v3.16b\n"
       "ldr q0, [%x[Apanel], #0x50]\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a433  // ummla v19.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e87a430  // ummla v16.4s, v1.16b, v7.16b\n"
+      ".inst 0x6e83a433  // ummla v19.4s, v1.16b, v3.16b\n"
       "ldr q1, [%x[Apanel], #0x60]\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e87a456  // ummla v22.4s, v2.16b, v7.16b\n"
+      ".inst 0x6e83a459  // ummla v25.4s, v2.16b, v3.16b\n"
       "ldr q2, [%x[Apanel], #0x70]\n"
-      ".inst 0x6e86a47c  // ummla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47f  // ummla v31.4s, v3.16b, v7.16b\n"
+      ".inst 0x6e87a4dc  // ummla v28.4s, v6.16b, v7.16b\n"
+      ".inst 0x6e83a4df  // ummla v31.4s, v6.16b, v3.16b\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "add x22, x22, #0xc0\n"
       "bge 3b\n"
@@ -191,89 +194,89 @@
       ".inst 0x6e84a47c  // ummla v28.4s, v3.16b, v4.16b\n"
       ".inst 0x6e85a47f  // ummla v31.4s, v3.16b, v5.16b\n"
       "cbz x20, 5f\n"
-      "ldr q6, [x22, #0x0]\n"
-      "ldr q0, [%x[Apanel], #0x0]\n"
-      ".inst 0x6e86a408  // ummla v8.4s, v0.16b, v6.16b\n"
-      "ldr q1, [%x[Apanel], #0x10]\n"
-      "ldr q7, [x22, #0x10]\n"
-      ".inst 0x6e87a40b  // ummla v11.4s, v0.16b, v7.16b\n"
-      "ldr q2, [%x[Apanel], #0x20]\n"
-      "ldr q3, [%x[Apanel], #0x30]\n"
-      ".inst 0x6e86a42e  // ummla v14.4s, v1.16b, v6.16b\n"
-      "ldr q4, [x22, #0x20]\n"
-      "ldr q5, [x22, #0x30]\n"
-      ".inst 0x6e87a431  // ummla v17.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a454  // ummla v20.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a457  // ummla v23.4s, v2.16b, v7.16b\n"
+      "ldr q1, [x22, #0x0]\n"
+      "ldr q7, [%x[Apanel], #0x0]\n"
+      ".inst 0x6e81a4e8  // ummla v8.4s, v7.16b, v1.16b\n"
+      "ldr q6, [%x[Apanel], #0x10]\n"
+      "ldr q0, [x22, #0x10]\n"
+      ".inst 0x6e80a4eb  // ummla v11.4s, v7.16b, v0.16b\n"
+      "ldr q5, [%x[Apanel], #0x20]\n"
+      "ldr q4, [%x[Apanel], #0x30]\n"
+      ".inst 0x6e81a4ce  // ummla v14.4s, v6.16b, v1.16b\n"
+      "ldr q3, [x22, #0x20]\n"
+      "ldr q2, [x22, #0x30]\n"
+      ".inst 0x6e80a4d1  // ummla v17.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e81a4b4  // ummla v20.4s, v5.16b, v1.16b\n"
+      ".inst 0x6e80a4b7  // ummla v23.4s, v5.16b, v0.16b\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x6e86a47a  // ummla v26.4s, v3.16b, v6.16b\n"
-      "ldr q6, [x22, #0x40]\n"
-      ".inst 0x6e87a47d  // ummla v29.4s, v3.16b, v7.16b\n"
-      "ldr q7, [x22, #0x50]\n"
-      ".inst 0x6e84a409  // ummla v9.4s, v0.16b, v4.16b\n"
-      ".inst 0x6e85a40c  // ummla v12.4s, v0.16b, v5.16b\n"
-      ".inst 0x6e84a42f  // ummla v15.4s, v1.16b, v4.16b\n"
-      ".inst 0x6e85a432  // ummla v18.4s, v1.16b, v5.16b\n"
+      ".inst 0x6e81a49a  // ummla v26.4s, v4.16b, v1.16b\n"
+      "ldr q1, [x22, #0x40]\n"
+      ".inst 0x6e80a49d  // ummla v29.4s, v4.16b, v0.16b\n"
+      "ldr q0, [x22, #0x50]\n"
+      ".inst 0x6e83a4e9  // ummla v9.4s, v7.16b, v3.16b\n"
+      ".inst 0x6e82a4ec  // ummla v12.4s, v7.16b, v2.16b\n"
+      ".inst 0x6e83a4cf  // ummla v15.4s, v6.16b, v3.16b\n"
+      ".inst 0x6e82a4d2  // ummla v18.4s, v6.16b, v2.16b\n"
       "add x22, x22, #0x60\n"
-      ".inst 0x6e84a455  // ummla v21.4s, v2.16b, v4.16b\n"
-      ".inst 0x6e85a458  // ummla v24.4s, v2.16b, v5.16b\n"
-      ".inst 0x6e84a47b  // ummla v27.4s, v3.16b, v4.16b\n"
-      ".inst 0x6e85a47e  // ummla v30.4s, v3.16b, v5.16b\n"
-      ".inst 0x6e86a40a  // ummla v10.4s, v0.16b, v6.16b\n"
-      ".inst 0x6e87a40d  // ummla v13.4s, v0.16b, v7.16b\n"
-      ".inst 0x6e86a430  // ummla v16.4s, v1.16b, v6.16b\n"
-      ".inst 0x6e87a433  // ummla v19.4s, v1.16b, v7.16b\n"
-      ".inst 0x6e86a456  // ummla v22.4s, v2.16b, v6.16b\n"
-      ".inst 0x6e87a459  // ummla v25.4s, v2.16b, v7.16b\n"
-      ".inst 0x6e86a47c  // ummla v28.4s, v3.16b, v6.16b\n"
-      ".inst 0x6e87a47f  // ummla v31.4s, v3.16b, v7.16b\n"
+      ".inst 0x6e83a4b5  // ummla v21.4s, v5.16b, v3.16b\n"
+      ".inst 0x6e82a4b8  // ummla v24.4s, v5.16b, v2.16b\n"
+      ".inst 0x6e83a49b  // ummla v27.4s, v4.16b, v3.16b\n"
+      ".inst 0x6e82a49e  // ummla v30.4s, v4.16b, v2.16b\n"
+      ".inst 0x6e81a4ea  // ummla v10.4s, v7.16b, v1.16b\n"
+      ".inst 0x6e80a4ed  // ummla v13.4s, v7.16b, v0.16b\n"
+      ".inst 0x6e81a4d0  // ummla v16.4s, v6.16b, v1.16b\n"
+      ".inst 0x6e80a4d3  // ummla v19.4s, v6.16b, v0.16b\n"
+      ".inst 0x6e81a4b6  // ummla v22.4s, v5.16b, v1.16b\n"
+      ".inst 0x6e80a4b9  // ummla v25.4s, v5.16b, v0.16b\n"
+      ".inst 0x6e81a49c  // ummla v28.4s, v4.16b, v1.16b\n"
+      ".inst 0x6e80a49f  // ummla v31.4s, v4.16b, v0.16b\n"
       "5:"  // multiply loop done
       "subs x23, x23, #0x1\n"
-      "uzp1 v4.2d, v8.2d, v11.2d\n"
+      "uzp1 v0.2d, v8.2d, v11.2d\n"
       "uzp2 v8.2d, v8.2d, v11.2d\n"
-      "uzp1 v11.2d, v9.2d, v12.2d\n"
+      "uzp1 v1.2d, v9.2d, v12.2d\n"
       "uzp2 v9.2d, v9.2d, v12.2d\n"
-      "str q4, [%x[Cpanel], #0x0]\n"
-      "uzp1 v12.2d, v10.2d, v13.2d\n"
+      "str q0, [%x[Cpanel], #0x0]\n"
+      "uzp1 v0.2d, v10.2d, v13.2d\n"
       "uzp2 v10.2d, v10.2d, v13.2d\n"
-      "str q11, [%x[Cpanel], #0x10]\n"
-      "str q12, [%x[Cpanel], #0x20]\n"
-      "uzp1 v13.2d, v14.2d, v17.2d\n"
+      "str q1, [%x[Cpanel], #0x10]\n"
+      "str q0, [%x[Cpanel], #0x20]\n"
+      "uzp1 v0.2d, v14.2d, v17.2d\n"
       "uzp2 v14.2d, v14.2d, v17.2d\n"
       "str q8, [%x[Cpanel], #0x30]\n"
-      "uzp1 v17.2d, v15.2d, v18.2d\n"
+      "uzp1 v2.2d, v15.2d, v18.2d\n"
       "uzp2 v15.2d, v15.2d, v18.2d\n"
       "str q9, [%x[Cpanel], #0x40]\n"
-      "uzp1 v18.2d, v16.2d, v19.2d\n"
+      "uzp1 v17.2d, v16.2d, v19.2d\n"
       "uzp2 v16.2d, v16.2d, v19.2d\n"
       "str q10, [%x[Cpanel], #0x50]\n"
-      "uzp1 v19.2d, v20.2d, v23.2d\n"
+      "uzp1 v1.2d, v20.2d, v23.2d\n"
       "uzp2 v20.2d, v20.2d, v23.2d\n"
-      "str q13, [%x[Cpanel], #0x60]\n"
-      "uzp1 v23.2d, v21.2d, v24.2d\n"
+      "str q0, [%x[Cpanel], #0x60]\n"
+      "uzp1 v0.2d, v21.2d, v24.2d\n"
       "uzp2 v21.2d, v21.2d, v24.2d\n"
-      "str q17, [%x[Cpanel], #0x70]\n"
-      "uzp1 v24.2d, v22.2d, v25.2d\n"
+      "str q2, [%x[Cpanel], #0x70]\n"
+      "uzp1 v23.2d, v22.2d, v25.2d\n"
       "uzp2 v22.2d, v22.2d, v25.2d\n"
-      "str q18, [%x[Cpanel], #0x80]\n"
-      "uzp1 v25.2d, v26.2d, v29.2d\n"
+      "str q17, [%x[Cpanel], #0x80]\n"
+      "uzp1 v19.2d, v26.2d, v29.2d\n"
       "uzp2 v26.2d, v26.2d, v29.2d\n"
       "str q14, [%x[Cpanel], #0x90]\n"
-      "uzp1 v29.2d, v27.2d, v30.2d\n"
+      "uzp1 v18.2d, v27.2d, v30.2d\n"
       "uzp2 v27.2d, v27.2d, v30.2d\n"
       "str q15, [%x[Cpanel], #0xa0]\n"
-      "uzp1 v30.2d, v28.2d, v31.2d\n"
+      "uzp1 v17.2d, v28.2d, v31.2d\n"
       "uzp2 v28.2d, v28.2d, v31.2d\n"
       "str q16, [%x[Cpanel], #0xb0]\n"
-      "str q19, [%x[Cpanel], #0xc0]\n"
-      "str q23, [%x[Cpanel], #0xd0]\n"
-      "str q24, [%x[Cpanel], #0xe0]\n"
+      "str q1, [%x[Cpanel], #0xc0]\n"
+      "str q0, [%x[Cpanel], #0xd0]\n"
+      "str q23, [%x[Cpanel], #0xe0]\n"
       "str q20, [%x[Cpanel], #0xf0]\n"
       "str q21, [%x[Cpanel], #0x100]\n"
       "str q22, [%x[Cpanel], #0x110]\n"
-      "str q25, [%x[Cpanel], #0x120]\n"
-      "str q29, [%x[Cpanel], #0x130]\n"
-      "str q30, [%x[Cpanel], #0x140]\n"
+      "str q19, [%x[Cpanel], #0x120]\n"
+      "str q18, [%x[Cpanel], #0x130]\n"
+      "str q17, [%x[Cpanel], #0x140]\n"
       "str q26, [%x[Cpanel], #0x150]\n"
       "str q27, [%x[Cpanel], #0x160]\n"
       "str q28, [%x[Cpanel], #0x170]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
index f86bceb..76f43f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,19 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "../std_transforms_sme.hpp"
 #include "../bfloat.hpp"
 
@@ -84,4 +83,4 @@
 
 #undef ARGLIST
 
-#endif // __aarch64__
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
index 520eeed..db29e42 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 #include "arm_gemm.hpp"
 #include "../../utils.hpp"
@@ -62,7 +62,7 @@
             break;
     }
     __asm__ __volatile__(
-      "ptrue p1.b\n"
+      "ptrue p8.b\n"
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x28, ALL, MUL #4\n"
       "add x27, %x[N], x28\n"
@@ -102,311 +102,311 @@
       "bgt 20f\n"
       "beq 12f\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x1\n"
+      "lsl x21, %x[K], #0x1\n"
       "mov x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 5f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
+      ".inst 0xa040c718  // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042f00  // mova za.d[x9, #0], { z24.d-z27.d }\n"
       "b 6f\n"
       "5:"  // Width 1: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "6:"  // Width 1: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 8f\n"
       "7:"  // Width 1: Multiply loop: Main loop head
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x8\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z8.h }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x8\n"
+      ".inst 0xa040a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158b298  // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[0]\n"
       "addvl x26, x26, #16\n"
-      "cmp x21, #0x8\n"
-      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+      "cmp x22, #0x8\n"
+      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158b498  // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[1]\n"
       "addvl x26, x26, #16\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158bb98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z8.h[2]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158bf18  // bfdot za.s[x9, 0], { z24.h-z27.h }, z8.h[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 7b\n"
       "8:"  // Width 1: Multiply loop: Single iteration only
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z11.h }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 9f\n"
-      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+      ".inst 0xc15bb398  // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
       ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bb598  // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
       ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 9f\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15bbc18  // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
       "addvl x26, x26, #16\n"
       "9:"  // Width 1: Multiply loop: multiply skip
       "tbz %x[flags], #1, 10f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
       ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+      "ld1rw { z3.s }, p1/Z, [x21]\n"
+      "ld1rw { z29.s }, p1/Z, [x20]\n"
+      ".inst 0xc1bdc868  // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
       ".inst 0xa060c328  // st1w { z8.s-z11.s }, p8, [x25]\n"
       "addvl x25, x25, #4\n"
       "b 11f\n"
       "10:"  // Width 1: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c328  // st1w { z8.s-z11.s }, p8, [x25]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c32c  // st1w { z12.s-z15.s }, p8, [x25]\n"
       "addvl x25, x25, #4\n"
       "11:"  // Width 1: Output done
       "b 36f\n"
       "12:"  // Width 2
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x1\n"
+      "lsl x21, %x[K], #0x1\n"
       "sub x20, %x[N], x28\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 13f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c708  // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042d01  // mova za.d[x9, #1], { z8.d-z11.d }\n"
+      ".inst 0xa040c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
+      ".inst 0xa041c714  // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
       "b 14f\n"
       "13:"  // Width 2: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "14:"  // Width 2: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 16f\n"
       "15:"  // Width 2: Multiply loop: Main loop head
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x8\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      "cmp x21, #0x8\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z9.h }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x8\n"
+      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc159b198  // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[0]\n"
+      "cmp x22, #0x8\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+      ".inst 0xc159b099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[0]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+      ".inst 0xc159b598  // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[1]\n"
+      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc159b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15abf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc159bb18  // bfdot za.s[x9, 0], { z24.h-z27.h }, z9.h[2]\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc159b819  // bfdot za.s[x9, 1], { z0.h-z3.h }, z9.h[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc159bc18  // bfdot za.s[x9, 0], { z0.h-z3.h }, z9.h[3]\n"
+      ".inst 0xa041a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc159bf99  // bfdot za.s[x9, 1], { z28.h-z31.h }, z9.h[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 15b\n"
       "16:"  // Width 2: Multiply loop: Single iteration only
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z11.h }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+      ".inst 0xc15bb198  // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bb019  // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bb718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bb419  // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bb998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
+      ".inst 0xa041a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bbb99  // bfdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15abf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+      ".inst 0xc15bbe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bbe99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
       "addvl x26, x26, #16\n"
       "17:"  // Width 2: Multiply loop: multiply skip
       "tbz %x[flags], #1, 18f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1a6c814  // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
-      ".inst 0xa061c334  // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c00  // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+      "ld1rw { z9.s }, p1/Z, [x21]\n"
+      ".inst 0xc0062c24  // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+      "ld1rw { z8.s }, p1/Z, [x20]\n"
+      ".inst 0xc1a8c920  // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+      ".inst 0xa060c720  // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+      ".inst 0xc1a8c924  // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+      ".inst 0xa061c324  // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
       "addvl x25, x25, #8\n"
       "b 19f\n"
       "18:"  // Width 2: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c334  // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c10  // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c730  // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
       "addvl x25, x25, #8\n"
       "19:"  // Width 2: Output done
       "b 36f\n"
       "20:"  // Width 3
       "mov x20, #0x2\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x1\n"
+      "lsl x21, %x[K], #0x1\n"
       "msub x20, x28, x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 21f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c708  // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042d01  // mova za.d[x9, #1], { z8.d-z11.d }\n"
-      ".inst 0xa042c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042c82  // mova za.d[x9, #2], { z4.d-z7.d }\n"
+      ".inst 0xa040c718  // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042f00  // mova za.d[x9, #0], { z24.d-z27.d }\n"
+      ".inst 0xa041c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042c81  // mova za.d[x9, #1], { z4.d-z7.d }\n"
+      ".inst 0xa042c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042e02  // mova za.d[x9, #2], { z16.d-z19.d }\n"
       "b 22f\n"
       "21:"  // Width 3: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "22:"  // Width 3: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 24f\n"
       "23:"  // Width 3: Multiply loop: Main loop head
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x8\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z15.h }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x8\n"
       ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      "cmp x21, #0x8\n"
+      ".inst 0xc15fb018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[0]\n"
+      "cmp x22, #0x8\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
-      ".inst 0xa042a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab29a  // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+      ".inst 0xc15fb099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+      ".inst 0xa042a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fb01a  // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15fb698  // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
+      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fb699  // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
+      ".inst 0xa042a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fb51a  // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[1]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
-      ".inst 0xa042a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab71a  // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
-      ".inst 0xa042a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abb9a  // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+      ".inst 0xc15fbb18  // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[2]\n"
+      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fb919  // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[2]\n"
+      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15abf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+      ".inst 0xc15fbe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fbe19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
+      ".inst 0xa042a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fbd1a  // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 23b\n"
       "24:"  // Width 3: Multiply loop: Single iteration only
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z11.h }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+      ".inst 0xc15bb398  // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bb019  // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
       ".inst 0xa042a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab29a  // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 25f\n"
-      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
-      ".inst 0xa042a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab71a  // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+      ".inst 0xc15bb29a  // bfdot za.s[x9, 2], { z20.h-z23.h }, z11.h[0]\n"
       "addvl x26, x26, #16\n"
       "ble 25f\n"
       ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bb598  // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
+      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bb499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z11.h[1]\n"
       ".inst 0xa042a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abb9a  // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+      ".inst 0xc15bb79a  // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 25f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15abf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bb898  // bfdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n"
+      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n"
+      ".inst 0xa042a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15bb99a  // bfdot za.s[x9, 2], { z12.h-z15.h }, z11.h[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 25f\n"
+      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15bbd98  // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[3]\n"
+      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bbe99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+      ".inst 0xc15bbe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
       "addvl x26, x26, #16\n"
       "25:"  // Width 3: Multiply loop: multiply skip
       "tbz %x[flags], #1, 26f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
-      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1a6c814  // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc1a6c810  // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
-      ".inst 0xa062c330  // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c04  // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+      "ld1rw { z17.s }, p1/Z, [x21]\n"
+      ".inst 0xc0062c28  // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+      "ld1rw { z16.s }, p1/Z, [x20]\n"
+      ".inst 0xc1b0ca24  // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xa060c724  // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+      ".inst 0xc1b0ca28  // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+      ".inst 0xa061c728  // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc1b0ca2c  // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+      ".inst 0xa062c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
       "addvl x25, x25, #12\n"
       "b 27f\n"
       "26:"  // Width 3: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
-      ".inst 0xa062c330  // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c14  // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c734  // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c72c  // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xa062c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
       "addvl x25, x25, #12\n"
       "27:"  // Width 3: Output done
       "b 36f\n"
       "28:"  // Width 4
       "mov x20, #0x3\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x1\n"
+      "lsl x21, %x[K], #0x1\n"
       "msub x20, x28, x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 29f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c708  // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042d01  // mova za.d[x9, #1], { z8.d-z11.d }\n"
-      ".inst 0xa042c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042c82  // mova za.d[x9, #2], { z4.d-z7.d }\n"
+      ".inst 0xa040c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
+      ".inst 0xa041c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
+      ".inst 0xa042c70c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042d82  // mova za.d[x9, #2], { z12.d-z15.d }\n"
       ".inst 0xa043c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
       ".inst 0xc0042e03  // mova za.d[x9, #3], { z16.d-z19.d }\n"
       "addvl x24, x24, #16\n"
@@ -414,126 +414,126 @@
       "29:"  // Width 4: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "30:"  // Width 4: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 32f\n"
       "31:"  // Width 4: Multiply loop: Main loop head
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x8\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      "cmp x21, #0x8\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z8.h }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x8\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158b218  // bfdot za.s[x9, 0], { z16.h-z19.h }, z8.h[0]\n"
+      "cmp x22, #0x8\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
-      ".inst 0xa042a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab29a  // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
-      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15ab21b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
-      ".inst 0xa042a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab71a  // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15ab61b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+      ".inst 0xa041a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158b199  // bfdot za.s[x9, 1], { z12.h-z15.h }, z8.h[0]\n"
+      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc158b21a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[0]\n"
+      ".inst 0xa043a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158b19b  // bfdot za.s[x9, 3], { z12.h-z15.h }, z8.h[0]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
-      ".inst 0xa042a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abb9a  // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
-      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aba9b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15abf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+      ".inst 0xc158b598  // bfdot za.s[x9, 0], { z12.h-z15.h }, z8.h[1]\n"
+      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158b699  // bfdot za.s[x9, 1], { z20.h-z23.h }, z8.h[1]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15abe1b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+      ".inst 0xc158b61a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[1]\n"
+      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158b69b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158b898  // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[2]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158ba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[2]\n"
+      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc158ba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[2]\n"
+      ".inst 0xa043a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158b81b  // bfdot za.s[x9, 3], { z0.h-z3.h }, z8.h[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158be98  // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[3]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158be19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[3]\n"
+      ".inst 0xa042a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc158bc9a  // bfdot za.s[x9, 2], { z4.h-z7.h }, z8.h[3]\n"
+      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158be9b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 31b\n"
       "32:"  // Width 4: Multiply loop: Single iteration only
-      "whilelt p0.h, XZR, x21\n"
-      "ld1rqh { z10.h }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      "whilelt p0.h, XZR, x22\n"
+      "ld1rqh { z11.h }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15ab018  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
-      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
-      ".inst 0xa042a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab29a  // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
-      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15ab21b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 33f\n"
-      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15ab619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
-      ".inst 0xa042a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15ab71a  // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
-      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15ab61b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 33f\n"
-      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc15ab998  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
-      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+      ".inst 0xc15bb218  // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n"
+      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bb299  // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[0]\n"
       ".inst 0xa042a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abb9a  // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
-      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aba9b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+      ".inst 0xc15bb39a  // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[0]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15bb21b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[0]\n"
+      "addvl x26, x26, #16\n"
+      "ble 33f\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bb418  // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[1]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bb619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
+      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15bb61a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[1]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15bb61b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 33f\n"
       ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15abe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
-      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15abf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15bba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[2]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15abe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+      ".inst 0xc15bba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[2]\n"
       ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15abe1b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+      ".inst 0xc15bba1b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 33f\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15bbe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n"
+      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bbf19  // bfdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n"
+      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15bbe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15bbe1b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[3]\n"
       "addvl x26, x26, #16\n"
       "33:"  // Width 4: Multiply loop: multiply skip
       "tbz %x[flags], #1, 34f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
-      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1a6c814  // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
-      ".inst 0xc0062c78  // mova { z24.d-z27.d }, za.d[x9, #3]\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc1a6c810  // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
-      ".inst 0xa062c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
-      ".inst 0xc1a6c818  // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
-      ".inst 0xa063c338  // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      "ld1rw { z21.s }, p1/Z, [x21]\n"
+      ".inst 0xc0062c38  // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+      "ld1rw { z20.s }, p1/Z, [x20]\n"
+      ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+      ".inst 0xa060c72c  // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc0062c70  // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+      ".inst 0xa061c738  // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+      ".inst 0xa062c720  // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      ".inst 0xa063c330  // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
       "addvl x25, x25, #16\n"
       "b 35f\n"
       "34:"  // Width 4: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c72c  // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c30  // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
       ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
       ".inst 0xa062c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
-      ".inst 0xc0062c78  // mova { z24.d-z27.d }, za.d[x9, #3]\n"
-      ".inst 0xa063c338  // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+      ".inst 0xc0062c64  // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+      ".inst 0xa063c324  // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
       "addvl x25, x25, #16\n"
       "35:"  // Width 4: Output done
       "subs x27, x27, #0x4\n"
@@ -541,7 +541,7 @@
       "bgt 4b\n"
       "36:"  // Exit
       ".inst 0xd503467f  // SMSTOP\n"
-      "ptrue p1.b\n"
+      "ptrue p8.b\n"
       : [N] "+&r" (N)
       : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -550,5 +550,4 @@
 
 } // namespace arm_gemm
 
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
index f33cb9a..7d98d5c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,19 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "../std_transforms_sme.hpp"
 
 #define ARGLIST  \
@@ -83,4 +82,4 @@
 
 #undef ARGLIST
 
-#endif // __aarch64__
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
index 9224868..d2c2605 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 #include "arm_gemm.hpp"
 #include "../../utils.hpp"
@@ -61,7 +61,7 @@
             break;
     }
     __asm__ __volatile__(
-      "ptrue p1.b\n"
+      "ptrue p8.b\n"
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x28, ALL, MUL #4\n"
       "add x27, %x[N], x28\n"
@@ -101,311 +101,311 @@
       "bgt 20f\n"
       "beq 12f\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "mov x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 5f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
+      ".inst 0xa040c718  // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042f00  // mova za.d[x9, #0], { z24.d-z27.d }\n"
       "b 6f\n"
       "5:"  // Width 1: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "6:"  // Width 1: setup done
-      "cmp x21, #0x4\n"
+      "cmp x22, #0x4\n"
       "ble 8f\n"
       "7:"  // Width 1: Multiply loop: Main loop head
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x4\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z8.s }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x4\n"
+      ".inst 0xa040c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158a280  // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[0]\n"
       "addvl x26, x26, #16\n"
-      "cmp x21, #0x4\n"
-      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+      "cmp x22, #0x4\n"
+      ".inst 0xa040c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158a480  // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[1]\n"
       "addvl x26, x26, #16\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+      ".inst 0xa040c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158ab80  // fmla za.s[x9, 0], { z28.s-z31.s }, z8.s[2]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158af00  // fmla za.s[x9, 0], { z24.s-z27.s }, z8.s[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 7b\n"
       "8:"  // Width 1: Multiply loop: Single iteration only
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z11.s }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xa040c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 9f\n"
-      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+      ".inst 0xc15ba380  // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
       ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15ba580  // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
       ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15baa00  // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 9f\n"
+      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15bac00  // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[3]\n"
       "addvl x26, x26, #16\n"
       "9:"  // Width 1: Multiply loop: multiply skip
       "tbz %x[flags], #1, 10f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
       ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+      "ld1rw { z3.s }, p1/Z, [x21]\n"
+      "ld1rw { z29.s }, p1/Z, [x20]\n"
+      ".inst 0xc1bdc868  // fclamp { z8.s-z11.s }, z3.s, z29.s\n"
       ".inst 0xa060c328  // st1w { z8.s-z11.s }, p8, [x25]\n"
       "addvl x25, x25, #4\n"
       "b 11f\n"
       "10:"  // Width 1: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c328  // st1w { z8.s-z11.s }, p8, [x25]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c32c  // st1w { z12.s-z15.s }, p8, [x25]\n"
       "addvl x25, x25, #4\n"
       "11:"  // Width 1: Output done
       "b 36f\n"
       "12:"  // Width 2
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "sub x20, %x[N], x28\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 13f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c708  // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042d01  // mova za.d[x9, #1], { z8.d-z11.d }\n"
+      ".inst 0xa040c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
+      ".inst 0xa041c714  // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
       "b 14f\n"
       "13:"  // Width 2: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "14:"  // Width 2: setup done
-      "cmp x21, #0x4\n"
+      "cmp x22, #0x4\n"
       "ble 16f\n"
       "15:"  // Width 2: Multiply loop: Main loop head
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x4\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      "cmp x21, #0x4\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z9.s }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x4\n"
+      ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc159a180  // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[0]\n"
+      "cmp x22, #0x4\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa601  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+      ".inst 0xc159a081  // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[0]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+      ".inst 0xc159a580  // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[1]\n"
+      ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc159a481  // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc159ab00  // fmla za.s[x9, 0], { z24.s-z27.s }, z9.s[2]\n"
+      ".inst 0xa041c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc159a801  // fmla za.s[x9, 1], { z0.s-z3.s }, z9.s[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc159ac00  // fmla za.s[x9, 0], { z0.s-z3.s }, z9.s[3]\n"
+      ".inst 0xa041c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc159af81  // fmla za.s[x9, 1], { z28.s-z31.s }, z9.s[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 15b\n"
       "16:"  // Width 2: Multiply loop: Single iteration only
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z11.s }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+      ".inst 0xc15ba180  // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[0]\n"
+      ".inst 0xa041c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ba001  // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa601  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15ba700  // fmla za.s[x9, 0], { z24.s-z27.s }, z11.s[1]\n"
+      ".inst 0xa041c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ba401  // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[1]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15ba980  // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[2]\n"
+      ".inst 0xa041c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bab81  // fmla za.s[x9, 1], { z28.s-z31.s }, z11.s[2]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+      ".inst 0xc15bae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+      ".inst 0xa041c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bae81  // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
       "addvl x26, x26, #16\n"
       "17:"  // Width 2: Multiply loop: multiply skip
       "tbz %x[flags], #1, 18f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1a6c814  // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
-      ".inst 0xa061c334  // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c00  // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+      "ld1rw { z9.s }, p1/Z, [x21]\n"
+      ".inst 0xc0062c24  // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+      "ld1rw { z8.s }, p1/Z, [x20]\n"
+      ".inst 0xc1a8c920  // fclamp { z0.s-z3.s }, z9.s, z8.s\n"
+      ".inst 0xa060c720  // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+      ".inst 0xc1a8c924  // fclamp { z4.s-z7.s }, z9.s, z8.s\n"
+      ".inst 0xa061c324  // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n"
       "addvl x25, x25, #8\n"
       "b 19f\n"
       "18:"  // Width 2: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c334  // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c10  // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c730  // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
       "addvl x25, x25, #8\n"
       "19:"  // Width 2: Output done
       "b 36f\n"
       "20:"  // Width 3
       "mov x20, #0x2\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "msub x20, x28, x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 21f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c708  // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042d01  // mova za.d[x9, #1], { z8.d-z11.d }\n"
-      ".inst 0xa042c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042c82  // mova za.d[x9, #2], { z4.d-z7.d }\n"
+      ".inst 0xa040c718  // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042f00  // mova za.d[x9, #0], { z24.d-z27.d }\n"
+      ".inst 0xa041c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042c81  // mova za.d[x9, #1], { z4.d-z7.d }\n"
+      ".inst 0xa042c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042e02  // mova za.d[x9, #2], { z16.d-z19.d }\n"
       "b 22f\n"
       "21:"  // Width 3: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "22:"  // Width 3: setup done
-      "cmp x21, #0x4\n"
+      "cmp x22, #0x4\n"
       "ble 24f\n"
       "23:"  // Width 3: Multiply loop: Main loop head
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x4\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z15.s }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x4\n"
       ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      "cmp x21, #0x4\n"
+      ".inst 0xc15fa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z15.s[0]\n"
+      "cmp x22, #0x4\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
-      ".inst 0xa042c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa282  // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+      ".inst 0xc15fa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z15.s[0]\n"
+      ".inst 0xa042c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fa002  // fmla za.s[x9, 2], { z0.s-z3.s }, z15.s[0]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15fa680  // fmla za.s[x9, 0], { z20.s-z23.s }, z15.s[1]\n"
+      ".inst 0xa041c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fa681  // fmla za.s[x9, 1], { z20.s-z23.s }, z15.s[1]\n"
+      ".inst 0xa042c749  // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fa502  // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[1]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa601  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
-      ".inst 0xa042c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa702  // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
-      ".inst 0xa042c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aab82  // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+      ".inst 0xc15fab00  // fmla za.s[x9, 0], { z24.s-z27.s }, z15.s[2]\n"
+      ".inst 0xa041c749  // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fa901  // fmla za.s[x9, 1], { z8.s-z11.s }, z15.s[2]\n"
+      ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15faa02  // fmla za.s[x9, 2], { z16.s-z19.s }, z15.s[2]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
-      ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aae02  // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+      ".inst 0xc15fae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z15.s[3]\n"
+      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fae01  // fmla za.s[x9, 1], { z16.s-z19.s }, z15.s[3]\n"
+      ".inst 0xa042c749  // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fad02  // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 23b\n"
       "24:"  // Width 3: Multiply loop: Single iteration only
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z11.s }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xa040c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+      ".inst 0xc15ba380  // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n"
+      ".inst 0xa041c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ba001  // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n"
       ".inst 0xa042c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa282  // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 25f\n"
-      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa601  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
-      ".inst 0xa042c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa702  // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+      ".inst 0xc15ba282  // fmla za.s[x9, 2], { z20.s-z23.s }, z11.s[0]\n"
       "addvl x26, x26, #16\n"
       "ble 25f\n"
       ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15ba580  // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n"
+      ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ba481  // fmla za.s[x9, 1], { z4.s-z7.s }, z11.s[1]\n"
       ".inst 0xa042c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aab82  // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+      ".inst 0xc15ba782  // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[1]\n"
       "addvl x26, x26, #16\n"
       "ble 25f\n"
-      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+      ".inst 0xa040c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15ba880  // fmla za.s[x9, 0], { z4.s-z7.s }, z11.s[2]\n"
+      ".inst 0xa041c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15baa81  // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[2]\n"
+      ".inst 0xa042c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15ba982  // fmla za.s[x9, 2], { z12.s-z15.s }, z11.s[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 25f\n"
+      ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15bad80  // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[3]\n"
+      ".inst 0xa041c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15bae81  // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n"
       ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aae02  // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+      ".inst 0xc15bae02  // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
       "addvl x26, x26, #16\n"
       "25:"  // Width 3: Multiply loop: multiply skip
       "tbz %x[flags], #1, 26f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
-      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1a6c814  // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc1a6c810  // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
-      ".inst 0xa062c330  // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c04  // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+      "ld1rw { z17.s }, p1/Z, [x21]\n"
+      ".inst 0xc0062c28  // mova { z8.d-z11.d }, za.d[x9, #1]\n"
+      "ld1rw { z16.s }, p1/Z, [x20]\n"
+      ".inst 0xc1b0ca24  // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xa060c724  // st1w { z4.s-z7.s }, pn9.b, [x25]\n"
+      ".inst 0xc1b0ca28  // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
+      ".inst 0xa061c728  // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc1b0ca2c  // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+      ".inst 0xa062c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
       "addvl x25, x25, #12\n"
       "b 27f\n"
       "26:"  // Width 3: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
-      ".inst 0xa062c330  // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c14  // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c734  // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c72c  // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xa062c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
       "addvl x25, x25, #12\n"
       "27:"  // Width 3: Output done
       "b 36f\n"
       "28:"  // Width 4
       "mov x20, #0x3\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "msub x20, x28, x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 29f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c708  // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042d01  // mova za.d[x9, #1], { z8.d-z11.d }\n"
-      ".inst 0xa042c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042c82  // mova za.d[x9, #2], { z4.d-z7.d }\n"
+      ".inst 0xa040c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
+      ".inst 0xa041c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
+      ".inst 0xa042c70c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042d82  // mova za.d[x9, #2], { z12.d-z15.d }\n"
       ".inst 0xa043c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
       ".inst 0xc0042e03  // mova za.d[x9, #3], { z16.d-z19.d }\n"
       "addvl x24, x24, #16\n"
@@ -413,126 +413,126 @@
       "29:"  // Width 4: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "30:"  // Width 4: setup done
-      "cmp x21, #0x4\n"
+      "cmp x22, #0x4\n"
       "ble 32f\n"
       "31:"  // Width 4: Multiply loop: Main loop head
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "sub x21, x21, #0x4\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      "cmp x21, #0x4\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z8.s }, p0/Z, [x23]\n"
+      "sub x22, x22, #0x4\n"
+      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158a200  // fmla za.s[x9, 0], { z16.s-z19.s }, z8.s[0]\n"
+      "cmp x22, #0x4\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
-      ".inst 0xa042c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa282  // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
-      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aa203  // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa601  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
-      ".inst 0xa042c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa702  // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aa603  // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+      ".inst 0xa041c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158a181  // fmla za.s[x9, 1], { z12.s-z15.s }, z8.s[0]\n"
+      ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc158a202  // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[0]\n"
+      ".inst 0xa043c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158a183  // fmla za.s[x9, 3], { z12.s-z15.s }, z8.s[0]\n"
       "addvl x26, x26, #16\n"
       ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
-      ".inst 0xa042c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aab82  // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
-      ".inst 0xa043c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aaa83  // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+      ".inst 0xc158a580  // fmla za.s[x9, 0], { z12.s-z15.s }, z8.s[1]\n"
+      ".inst 0xa041c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158a681  // fmla za.s[x9, 1], { z20.s-z23.s }, z8.s[1]\n"
       ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aae02  // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aae03  // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+      ".inst 0xc158a602  // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[1]\n"
+      ".inst 0xa043c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158a683  // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158a880  // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[2]\n"
+      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158aa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[2]\n"
+      ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc158aa02  // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[2]\n"
+      ".inst 0xa043c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158a803  // fmla za.s[x9, 3], { z0.s-z3.s }, z8.s[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc158ae80  // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[3]\n"
+      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc158ae01  // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[3]\n"
+      ".inst 0xa042c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc158ac82  // fmla za.s[x9, 2], { z4.s-z7.s }, z8.s[3]\n"
+      ".inst 0xa043c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc158ae83  // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[3]\n"
       "addvl x26, x26, #16\n"
       "bgt 31b\n"
       "32:"  // Width 4: Multiply loop: Single iteration only
-      "whilelt p0.s, XZR, x21\n"
-      "ld1rqw { z10.s }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      "whilelt p0.s, XZR, x22\n"
+      "ld1rqw { z11.s }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc15aa000  // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
-      ".inst 0xa041c745  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa081  // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
-      ".inst 0xa042c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa282  // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
-      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aa203  // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 33f\n"
-      ".inst 0xa040c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa700  // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aa601  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
-      ".inst 0xa042c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aa702  // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
-      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aa603  // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 33f\n"
-      ".inst 0xa040c74d  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x1\n"
-      ".inst 0xc15aa980  // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
-      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+      ".inst 0xc15ba200  // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[0]\n"
+      ".inst 0xa041c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ba281  // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[0]\n"
       ".inst 0xa042c75d  // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aab82  // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
-      ".inst 0xa043c755  // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aaa83  // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+      ".inst 0xc15ba382  // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[0]\n"
+      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15ba203  // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[0]\n"
+      "addvl x26, x26, #16\n"
+      "ble 33f\n"
+      ".inst 0xa040c741  // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15ba400  // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[1]\n"
+      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ba601  // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[1]\n"
+      ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15ba602  // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[1]\n"
+      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15ba603  // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[1]\n"
       "addvl x26, x26, #16\n"
       "ble 33f\n"
       ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
-      ".inst 0xc15aae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
-      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc15aaf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+      "subs x22, x22, #0x1\n"
+      ".inst 0xc15baa00  // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n"
+      ".inst 0xa041c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15baa01  // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[2]\n"
       ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc15aae02  // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+      ".inst 0xc15baa02  // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[2]\n"
       ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc15aae03  // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+      ".inst 0xc15baa03  // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 33f\n"
+      ".inst 0xa040c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15bae00  // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n"
+      ".inst 0xa041c759  // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15baf01  // fmla za.s[x9, 1], { z24.s-z27.s }, z11.s[3]\n"
+      ".inst 0xa042c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15bae02  // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n"
+      ".inst 0xa043c751  // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15bae03  // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[3]\n"
       "addvl x26, x26, #16\n"
       "33:"  // Width 4: Multiply loop: multiply skip
       "tbz %x[flags], #1, 34f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z0.s }, p1/Z, [x21]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      "ld1rw { z6.s }, p1/Z, [x20]\n"
-      ".inst 0xc1a6c808  // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
-      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1a6c814  // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
-      ".inst 0xc0062c78  // mova { z24.d-z27.d }, za.d[x9, #3]\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc1a6c810  // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
-      ".inst 0xa062c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
-      ".inst 0xc1a6c818  // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
-      ".inst 0xa063c338  // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      "ld1rw { z21.s }, p1/Z, [x21]\n"
+      ".inst 0xc0062c38  // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+      "ld1rw { z20.s }, p1/Z, [x20]\n"
+      ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+      ".inst 0xa060c72c  // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc0062c70  // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+      ".inst 0xa061c738  // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+      ".inst 0xa062c720  // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      ".inst 0xa063c330  // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
       "addvl x25, x25, #16\n"
       "b 35f\n"
       "34:"  // Width 4: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c34  // mova { z20.d-z23.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c72c  // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c30  // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
       ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
       ".inst 0xa062c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
-      ".inst 0xc0062c78  // mova { z24.d-z27.d }, za.d[x9, #3]\n"
-      ".inst 0xa063c338  // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+      ".inst 0xc0062c64  // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+      ".inst 0xa063c324  // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
       "addvl x25, x25, #16\n"
       "35:"  // Width 4: Output done
       "subs x27, x27, #0x4\n"
@@ -540,7 +540,7 @@
       "bgt 4b\n"
       "36:"  // Exit
       ".inst 0xd503467f  // SMSTOP\n"
-      "ptrue p1.b\n"
+      "ptrue p8.b\n"
       : [N] "+&r" (N)
       : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -549,5 +549,4 @@
 
 } // namespace arm_gemm
 
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
index f52fbcd..76c2bdd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,19 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "../std_transforms_sme.hpp"
 #include "../bfloat.hpp"
 
@@ -84,4 +83,4 @@
 
 #undef ARGLIST
 
-#endif // __aarch64__
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
index 0a394b6..c6fa110 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 #include "arm_gemm.hpp"
 #include "../../utils.hpp"
@@ -62,7 +62,7 @@
             break;
     }
     __asm__ __volatile__(
-      "ptrue p2.b\n"
+      "ptrue p8.b\n"
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x10, ALL, MUL #4\n"
       "add x28, %x[N], x10\n"
@@ -103,494 +103,494 @@
       "bgt 20f\n"
       "beq 12f\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "mov x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 5f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
+      ".inst 0xa040c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042e00  // mova za.d[x9, #0], { z16.d-z19.d }\n"
       "b 6f\n"
       "5:"  // Width 1: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "6:"  // Width 1: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 8f\n"
       "7:"  // Width 1: Multiply loop: Main loop head
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "sub x21, x21, #0x8\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z10.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa94a  // bfcvt z10.h, p2/M, z10.s\n"
+      "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aaa10  // bfcvt z16.h, p2/M, z16.s\n"
+      "uzp1 z10.h, z10.h, z10.h\n"
+      "sub x22, x22, #0x8\n"
+      "uzp1 z16.h, z16.h, z16.h\n"
+      "trn1 z10.d, z10.d, z16.d\n"
+      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15ab198  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[0]\n"
+      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
       "addvl x26, x26, #16\n"
-      "cmp x21, #0x8\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      "cmp x22, #0x8\n"
+      ".inst 0xc15ab598  // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[1]\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
       "addvl x26, x26, #16\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+      ".inst 0xc15ab818  // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[2]\n"
       ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xc15abf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z10.h[3]\n"
       "bgt 7b\n"
       "8:"  // Width 1: Multiply loop: Single iteration only
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "subs x21, x21, #0x2\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z15.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa9ef  // bfcvt z15.h, p2/M, z15.s\n"
+      "ld1rqw { z17.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aaa31  // bfcvt z17.h, p2/M, z17.s\n"
+      "uzp1 z15.h, z15.h, z15.h\n"
+      "subs x22, x22, #0x2\n"
+      "uzp1 z17.h, z17.h, z17.h\n"
+      "trn1 z15.d, z15.d, z17.d\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+      ".inst 0xc15fb218  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fb418  // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fb898  // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
       "addvl x26, x26, #16\n"
       "ble 9f\n"
-      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xa040a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15fbd18  // bfdot za.s[x9, 0], { z8.h-z11.h }, z15.h[3]\n"
       "addvl x26, x26, #16\n"
       "9:"  // Width 1: Multiply loop: multiply skip
       "tbz %x[flags], #1, 10f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z29.s }, p2/Z, [x21]\n"
-      "ld1rw { z18.s }, p2/Z, [x20]\n"
-      ".inst 0xc1b2cba8  // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
-      ".inst 0xa060c328  // st1w { z8.s-z11.s }, p8, [x25]\n"
+      ".inst 0xc0062c00  // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+      "ld1rw { z8.s }, p2/Z, [x21]\n"
+      "ld1rw { z26.s }, p2/Z, [x20]\n"
+      ".inst 0xc1bac900  // fclamp { z0.s-z3.s }, z8.s, z26.s\n"
+      ".inst 0xa060c320  // st1w { z0.s-z3.s }, p8, [x25]\n"
       "addvl x25, x25, #4\n"
       "b 11f\n"
       "10:"  // Width 1: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c328  // st1w { z8.s-z11.s }, p8, [x25]\n"
+      ".inst 0xc0062c04  // mova { z4.d-z7.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c324  // st1w { z4.s-z7.s }, p8, [x25]\n"
       "addvl x25, x25, #4\n"
       "11:"  // Width 1: Output done
       "b 36f\n"
       "12:"  // Width 2
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "sub x20, %x[N], x10\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 13f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
+      ".inst 0xa040c718  // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042f00  // mova za.d[x9, #0], { z24.d-z27.d }\n"
       ".inst 0xa041c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
       ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
       "b 14f\n"
       "13:"  // Width 2: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "14:"  // Width 2: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 16f\n"
       "15:"  // Width 2: Multiply loop: Main loop head
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "sub x21, x21, #0x8\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
-      "cmp x21, #0x8\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z13.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa9ad  // bfcvt z13.h, p2/M, z13.s\n"
+      "ld1rqw { z27.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aab7b  // bfcvt z27.h, p2/M, z27.s\n"
+      "uzp1 z13.h, z13.h, z13.h\n"
+      "sub x22, x22, #0x8\n"
+      "uzp1 z27.h, z27.h, z27.h\n"
+      "trn1 z13.d, z13.d, z27.d\n"
+      ".inst 0xa040a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+      "cmp x22, #0x8\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15db298  // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[0]\n"
       "addvl x26, x26, #16\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xc150b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15db019  // bfdot za.s[x9, 1], { z0.h-z3.h }, z13.h[0]\n"
+      ".inst 0xa040a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15db698  // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xc15db719  // bfdot za.s[x9, 1], { z24.h-z27.h }, z13.h[1]\n"
+      ".inst 0xa040a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15db918  // bfdot za.s[x9, 0], { z8.h-z11.h }, z13.h[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xc15dba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z13.h[2]\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
       ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+      ".inst 0xc15dbc18  // bfdot za.s[x9, 0], { z0.h-z3.h }, z13.h[3]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xc150ba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
-      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xc150bd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+      ".inst 0xc15dbc99  // bfdot za.s[x9, 1], { z4.h-z7.h }, z13.h[3]\n"
       "bgt 15b\n"
       "16:"  // Width 2: Multiply loop: Single iteration only
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "subs x21, x21, #0x2\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z15.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa9ef  // bfcvt z15.h, p2/M, z15.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aa8a5  // bfcvt z5.h, p2/M, z5.s\n"
+      "uzp1 z15.h, z15.h, z15.h\n"
+      "subs x22, x22, #0x2\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "trn1 z15.d, z15.d, z5.d\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fb218  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+      ".inst 0xc15fb319  // bfdot za.s[x9, 1], { z24.h-z27.h }, z15.h[0]\n"
       "ble 17f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fb798  // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[1]\n"
       ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+      ".inst 0xc15fb499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
-      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150ba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
       "addvl x26, x26, #16\n"
       "ble 17f\n"
       ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xc15fbf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
       ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150bd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+      ".inst 0xc15fbd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
       "addvl x26, x26, #16\n"
       "17:"  // Width 2: Multiply loop: multiply skip
       "tbz %x[flags], #1, 18f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z29.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c14  // mova { z20.d-z23.d }, za.d[x9, #0]\n"
+      "ld1rw { z11.s }, p2/Z, [x21]\n"
       ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      "ld1rw { z18.s }, p2/Z, [x20]\n"
-      ".inst 0xc1b2cba8  // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1b2cbac  // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      ".inst 0xc1bcc974  // fclamp { z20.s-z23.s }, z11.s, z28.s\n"
+      ".inst 0xa060c734  // st1w { z20.s-z23.s }, pn9.b, [x25]\n"
+      ".inst 0xc1bcc96c  // fclamp { z12.s-z15.s }, z11.s, z28.s\n"
       ".inst 0xa061c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
       "addvl x25, x25, #8\n"
       "b 19f\n"
       "18:"  // Width 2: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c00  // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c720  // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c20  // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c320  // st1w { z0.s-z3.s }, p8, [x25, #0x4, MUL VL]\n"
       "addvl x25, x25, #8\n"
       "19:"  // Width 2: Output done
       "b 36f\n"
       "20:"  // Width 3
       "mov x20, #0x2\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "msub x20, x10, x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 21f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
-      ".inst 0xa042c71c  // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042f82  // mova za.d[x9, #2], { z28.d-z31.d }\n"
+      ".inst 0xa040c71c  // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042f80  // mova za.d[x9, #0], { z28.d-z31.d }\n"
+      ".inst 0xa041c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042c81  // mova za.d[x9, #1], { z4.d-z7.d }\n"
+      ".inst 0xa042c704  // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042c82  // mova za.d[x9, #2], { z4.d-z7.d }\n"
       "b 22f\n"
       "21:"  // Width 3: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "22:"  // Width 3: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 24f\n"
       "23:"  // Width 3: Multiply loop: Main loop head
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "sub x21, x21, #0x8\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z14.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa9ce  // bfcvt z14.h, p2/M, z14.s\n"
+      "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aaa10  // bfcvt z16.h, p2/M, z16.s\n"
+      "uzp1 z14.h, z14.h, z14.h\n"
+      "sub x22, x22, #0x8\n"
+      "uzp1 z16.h, z16.h, z16.h\n"
+      "trn1 z14.d, z14.d, z16.d\n"
       ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
-      "cmp x21, #0x8\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+      "cmp x22, #0x8\n"
+      ".inst 0xa041a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15eb098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z14.h[0]\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+      ".inst 0xa042a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15eb319  // bfdot za.s[x9, 1], { z24.h-z27.h }, z14.h[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b21a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15eb01a  // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[0]\n"
+      ".inst 0xa040a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
       ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
-      ".inst 0xa042a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xc150b59a  // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+      ".inst 0xc15eb518  // bfdot za.s[x9, 0], { z8.h-z11.h }, z14.h[1]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150ba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+      ".inst 0xc15eb499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z14.h[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150ba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
-      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xc15eb61a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[1]\n"
+      ".inst 0xa040a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xa041a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15eb818  // bfdot za.s[x9, 0], { z0.h-z3.h }, z14.h[2]\n"
+      ".inst 0xa042a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15ebb99  // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xc15eb81a  // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[2]\n"
+      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xa041a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15ebf18  // bfdot za.s[x9, 0], { z24.h-z27.h }, z14.h[3]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150bd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+      ".inst 0xc15ebf99  // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[3]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150be1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+      ".inst 0xc15ebe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[3]\n"
       "bgt 23b\n"
       "24:"  // Width 3: Multiply loop: Single iteration only
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "subs x21, x21, #0x2\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z15.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa9ef  // bfcvt z15.h, p2/M, z15.s\n"
+      "ld1rqw { z31.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aabff  // bfcvt z31.h, p2/M, z31.s\n"
+      "uzp1 z15.h, z15.h, z15.h\n"
+      "subs x22, x22, #0x2\n"
+      "uzp1 z31.h, z31.h, z31.h\n"
+      "trn1 z15.d, z15.d, z31.d\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fb218  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n"
+      ".inst 0xa042a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fb019  // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b21a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+      ".inst 0xc15fb09a  // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[0]\n"
       "ble 25f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
-      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
-      ".inst 0xa042a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b59a  // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 25f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+      ".inst 0xa040a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fb698  // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n"
       ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150ba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+      ".inst 0xc15fb699  // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150ba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+      ".inst 0xc15fb61a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 25f\n"
+      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fb898  // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n"
+      ".inst 0xa041a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fb819  // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[2]\n"
+      ".inst 0xa042a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fbb1a  // bfdot za.s[x9, 2], { z24.h-z27.h }, z15.h[2]\n"
       "addvl x26, x26, #16\n"
       "ble 25f\n"
       ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xc15fbf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n"
       ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150bd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150be1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+      ".inst 0xc15fbd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n"
+      ".inst 0xa042a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fbc9a  // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[3]\n"
       "addvl x26, x26, #16\n"
       "25:"  // Width 3: Multiply loop: multiply skip
       "tbz %x[flags], #1, 26f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z29.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      "ld1rw { z18.s }, p2/Z, [x20]\n"
-      ".inst 0xc1b2cba8  // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
-      ".inst 0xc0062c44  // mova { z4.d-z7.d }, za.d[x9, #2]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1b2cbac  // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
-      ".inst 0xa061c72c  // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc1b2cba4  // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
-      ".inst 0xa062c324  // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+      "ld1rw { z17.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c24  // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      ".inst 0xc1b0ca3c  // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xa060c73c  // st1w { z28.s-z31.s }, pn9.b, [x25]\n"
+      ".inst 0xc1b0ca24  // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+      ".inst 0xa061c724  // st1w { z4.s-z7.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc1b0ca2c  // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+      ".inst 0xa062c32c  // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n"
       "addvl x25, x25, #12\n"
       "b 27f\n"
       "26:"  // Width 3: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c72c  // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc0062c44  // mova { z4.d-z7.d }, za.d[x9, #2]\n"
-      ".inst 0xa062c324  // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c00  // mova { z0.d-z3.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c720  // st1w { z0.s-z3.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c30  // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c50  // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+      ".inst 0xa062c330  // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n"
       "addvl x25, x25, #12\n"
       "27:"  // Width 3: Output done
       "b 36f\n"
       "28:"  // Width 4
       "mov x20, #0x3\n"
       "mov x23, %x[A_ptr]\n"
-      "lsl x22, %x[K], #0x2\n"
+      "lsl x21, %x[K], #0x2\n"
       "msub x20, x10, x20, %x[N]\n"
-      "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       ".inst 0x25b467f0  // whilelt p8.s, XZR, x20, VLx4\n"
       "cbz x24, 29f\n"
-      ".inst 0xa040c700  // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
-      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
-      ".inst 0xa041c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
-      ".inst 0xa042c71c  // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042f82  // mova za.d[x9, #2], { z28.d-z31.d }\n"
-      ".inst 0xa043c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
-      ".inst 0xc0042e03  // mova za.d[x9, #3], { z16.d-z19.d }\n"
+      ".inst 0xa040c70c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x24]\n"
+      ".inst 0xc0042d80  // mova za.d[x9, #0], { z12.d-z15.d }\n"
+      ".inst 0xa041c70c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042d81  // mova za.d[x9, #1], { z12.d-z15.d }\n"
+      ".inst 0xa042c710  // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042e02  // mova za.d[x9, #2], { z16.d-z19.d }\n"
+      ".inst 0xa043c714  // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n"
+      ".inst 0xc0042e83  // mova za.d[x9, #3], { z20.d-z23.d }\n"
       "addvl x24, x24, #16\n"
       "b 30f\n"
       "29:"  // Width 4: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "30:"  // Width 4: setup done
-      "cmp x21, #0x8\n"
+      "cmp x22, #0x8\n"
       "ble 32f\n"
       "31:"  // Width 4: Multiply loop: Main loop head
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "sub x21, x21, #0x8\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
-      "cmp x21, #0x8\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z6.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa8c6  // bfcvt z6.h, p2/M, z6.s\n"
+      "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aaa10  // bfcvt z16.h, p2/M, z16.s\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "sub x22, x22, #0x8\n"
+      "uzp1 z16.h, z16.h, z16.h\n"
+      "trn1 z6.d, z6.d, z16.d\n"
+      ".inst 0xa040a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+      "cmp x22, #0x8\n"
       ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+      ".inst 0xc156b198  // bfdot za.s[x9, 0], { z12.h-z15.h }, z6.h[0]\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
-      ".inst 0xa043a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150b21a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xc150b39b  // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
       ".inst 0xa042a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
-      ".inst 0xa043a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150b59a  // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+      ".inst 0xc156b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z6.h[0]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc156b19a  // bfdot za.s[x9, 2], { z12.h-z15.h }, z6.h[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b79b  // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150ba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
-      ".inst 0xa043a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150ba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+      ".inst 0xc156b21b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[0]\n"
+      ".inst 0xa040a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xa041a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc156b518  // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[1]\n"
+      ".inst 0xa042a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc156b599  // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n"
+      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc156b41a  // bfdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150b99b  // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+      ".inst 0xc156b69b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[1]\n"
+      ".inst 0xa040a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xa041a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc156b918  // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[2]\n"
+      ".inst 0xa042a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc156b999  // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[2]\n"
+      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc156b91a  // bfdot za.s[x9, 2], { z8.h-z11.h }, z6.h[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xc156ba9b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[2]\n"
       ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150bd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
-      ".inst 0xa043a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150be1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+      ".inst 0xa041a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc156bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z6.h[3]\n"
+      ".inst 0xa042a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc156bd99  // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[3]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc156bf1a  // bfdot za.s[x9, 2], { z24.h-z27.h }, z6.h[3]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xc150bf9b  // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xc156be1b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[3]\n"
       "bgt 31b\n"
       "32:"  // Width 4: Multiply loop: Single iteration only
-      "whilelt p1.s, XZR, x21\n"
-      "whilelt p0.s, x27, x21\n"
-      "ld1rqw { z0.s }, p1/Z, [x23]\n"
-      ".inst 0x658aa800  // bfcvt z0.h, p2/M, z0.s\n"
-      "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
-      ".inst 0x658aa96b  // bfcvt z11.h, p2/M, z11.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "subs x21, x21, #0x2\n"
-      "uzp1 z11.h, z11.h, z11.h\n"
-      "trn1 z0.d, z0.d, z11.d\n"
-      ".inst 0xa040a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
+      "whilelt p1.s, XZR, x22\n"
+      "whilelt p0.s, x27, x22\n"
+      "ld1rqw { z15.s }, p1/Z, [x23]\n"
+      ".inst 0x658aa9ef  // bfcvt z15.h, p2/M, z15.s\n"
+      "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+      ".inst 0x658aaa10  // bfcvt z16.h, p2/M, z16.s\n"
+      "uzp1 z15.h, z15.h, z15.h\n"
+      "subs x22, x22, #0x2\n"
+      "uzp1 z16.h, z16.h, z16.h\n"
+      "trn1 z15.d, z15.d, z16.d\n"
+      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
       "add x23, x23, #0x20\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b098  // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
-      ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b119  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
-      ".inst 0xa043a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150b21a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xc150b39b  // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
-      "ble 33f\n"
-      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150b618  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
       ".inst 0xa041a745  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150b499  // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
-      ".inst 0xa042a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150b59a  // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
-      ".inst 0xa043a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150b79b  // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
+      ".inst 0xc15fb318  // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[0]\n"
+      ".inst 0xa042a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fb099  // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15fb01a  // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xc15fb21b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[0]\n"
+      "ble 33f\n"
+      ".inst 0xa040a759  // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fb718  // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[1]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fb619  // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[1]\n"
+      ".inst 0xa042a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc15fb69a  // bfdot za.s[x9, 2], { z20.h-z23.h }, z15.h[1]\n"
+      ".inst 0xa043a741  // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15fb41b  // bfdot za.s[x9, 3], { z0.h-z3.h }, z15.h[1]\n"
       "addvl x26, x26, #16\n"
       "ble 33f\n"
       ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
-      "subs x21, x21, #0x2\n"
-      ".inst 0xc150ba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
-      ".inst 0xa041a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150ba99  // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+      "subs x22, x22, #0x2\n"
+      ".inst 0xc15fba18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fba19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150ba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
-      ".inst 0xa043a74d  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150b99b  // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+      ".inst 0xc15fba1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n"
+      ".inst 0xa043a755  // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15fba9b  // bfdot za.s[x9, 3], { z20.h-z23.h }, z15.h[2]\n"
       "addvl x26, x26, #16\n"
       "ble 33f\n"
-      ".inst 0xa040a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
-      ".inst 0xc150bf98  // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
-      ".inst 0xa041a749  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc150bd19  // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+      ".inst 0xa040a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+      ".inst 0xc15fbe18  // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n"
+      ".inst 0xa041a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc15fbe19  // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n"
       ".inst 0xa042a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc150be1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
-      ".inst 0xa043a75d  // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc150bf9b  // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+      ".inst 0xc15fbe1a  // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[3]\n"
+      ".inst 0xa043a751  // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc15fbe1b  // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[3]\n"
       "addvl x26, x26, #16\n"
       "33:"  // Width 4: Multiply loop: multiply skip
       "tbz %x[flags], #1, 34f\n"
       "add x21, %x[args_ptr], %[offset_min]\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      "ld1rw { z29.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      "ld1rw { z18.s }, p2/Z, [x20]\n"
-      ".inst 0xc1b2cba8  // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
-      ".inst 0xc0062c44  // mova { z4.d-z7.d }, za.d[x9, #2]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc1b2cbac  // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
-      ".inst 0xc0062c60  // mova { z0.d-z3.d }, za.d[x9, #3]\n"
-      ".inst 0xa061c72c  // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc1b2cba4  // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
-      ".inst 0xa062c724  // st1w { z4.s-z7.s }, pn9.b, [x25, #0x8, MUL VL]\n"
-      ".inst 0xc1b2cba0  // fclamp { z0.s-z3.s }, z29.s, z18.s\n"
-      ".inst 0xa063c320  // st1w { z0.s-z3.s }, p8, [x25, #0xc, MUL VL]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      "ld1rw { z21.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c38  // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+      ".inst 0xa060c72c  // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc0062c70  // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+      ".inst 0xa061c738  // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+      ".inst 0xa062c720  // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      ".inst 0xa063c330  // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
       "addvl x25, x25, #16\n"
       "b 35f\n"
       "34:"  // Width 4: No activation
-      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
-      ".inst 0xa060c728  // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xa061c72c  // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n"
-      ".inst 0xc0062c44  // mova { z4.d-z7.d }, za.d[x9, #2]\n"
-      ".inst 0xa062c724  // st1w { z4.s-z7.s }, pn9.b, [x25, #0x8, MUL VL]\n"
-      ".inst 0xc0062c60  // mova { z0.d-z3.d }, za.d[x9, #3]\n"
-      ".inst 0xa063c320  // st1w { z0.s-z3.s }, p8, [x25, #0xc, MUL VL]\n"
+      ".inst 0xc0062c10  // mova { z16.d-z19.d }, za.d[x9, #0]\n"
+      ".inst 0xa060c730  // st1w { z16.s-z19.s }, pn9.b, [x25]\n"
+      ".inst 0xc0062c30  // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+      ".inst 0xa061c730  // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+      ".inst 0xc0062c54  // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+      ".inst 0xa062c734  // st1w { z20.s-z23.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+      ".inst 0xc0062c78  // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+      ".inst 0xa063c338  // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
       "addvl x25, x25, #16\n"
       "35:"  // Width 4: Output done
       "subs x28, x28, #0x4\n"
@@ -598,7 +598,7 @@
       "bgt 4b\n"
       "36:"  // Exit
       ".inst 0xd503467f  // SMSTOP\n"
-      "ptrue p2.b\n"
+      "ptrue p8.b\n"
       : [N] "+&r" (N)
       : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -607,5 +607,4 @@
 
 } // namespace arm_gemm
 
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
index 4c9f9cf..65e4667 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,19 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "../std_transforms_sme.hpp"
 
 #define ARGLIST  \
@@ -83,4 +82,4 @@
 
 #undef ARGLIST
 
-#endif // __aarch64__
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
index 26dc0b9..86bd8ae 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 #include "arm_gemm.hpp"
 #include "../../utils.hpp"
@@ -35,11 +35,9 @@
 void sme2_gemv_s8qa_dot_16VL (
     const int8_t *A_ptr, const int8_t *B_ptr, int8_t *output_ptr,
     size_t N, size_t K,
-    const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
 )
 {
-    ARM_COMPUTE_UNUSED(col_base);
-
     struct KernelArgs {
         const int8_t *B_ptr = {};
         size_t output_offset = {};
@@ -52,7 +50,7 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-      "ptrue p2.b\n"
+      "ptrue p8.b\n"
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x28, ALL, MUL #4\n"
       "add x27, %x[N], x28\n"
@@ -84,8 +82,8 @@
       ".inst 0xf8b64b5a  // rprfm pldonce, x22, [x26]\n"
       "3:"  // RHS prefetch exit
       "mov x24, %x[col_bias]\n"
-      "mov z26.s, #0x0\n"
-      "mov z24.b, #0x1\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.b, #0x1\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "4:"  // Column loop
       "cmp x27, #0x4\n"
@@ -94,404 +92,404 @@
       "bgt 24f\n"
       "beq 14f\n"
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "mov x20, %x[N]\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x20, %x[N]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 5f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
+      ".inst 0xa040c300  // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
       "b 6f\n"
       "5:"  // Width 1: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "6:"  // Width 1: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 9f\n"
       "7:"  // Width 1: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
       ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+      ".inst 0xc151b2a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b5a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b9a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bda0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 8f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "8:"  // Width 1: Multiply loop: unique 1: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 7b\n"
       "9:"  // Width 1: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+      ".inst 0xc151b1a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
       "ble 10f\n"
       ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
       "addvl x26, x26, #16\n"
       "ble 10f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+      ".inst 0xa0408349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b920  // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
       "addvl x26, x26, #16\n"
       "ble 10f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xa0408349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bd20  // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "10:"  // Width 1: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 11f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "11:"  // Width 1: Multiply loop: unique 2: skip row sum
       "tbnz %x[flags], #31, 12f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z26.s }, p2/Z, [x21]\n"
+      "neg z26.s, p2/M, z26.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "saddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "saddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z26.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "12:"  // Width 1: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z7.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p1, [x25]\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      ".inst 0xc1a1ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+      "ld1rw { z30.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a2ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+      ".inst 0xc1bece0c  // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "uzp1 z19.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z19.b\n"
+      "st1b { z12.b }, p1, [x25]\n"
       "addvl x25, x25, #1\n"
       "13:"  // Width 1: Output done
       "b 44f\n"
       "14:"  // Width 2
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "sub x20, %x[N], x28\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "sub x20, %x[N], x28\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 15f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
-      ".inst 0xa041c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
+      ".inst 0xa040c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042e00  // mova za.d[x9, #0], { z16.d-z19.d }\n"
+      ".inst 0xa041c318  // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042f01  // mova za.d[x9, #1], { z24.d-z27.d }\n"
       "b 16f\n"
       "15:"  // Width 2: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "16:"  // Width 2: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 19f\n"
       "17:"  // Width 2: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b1a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+      ".inst 0xa0418359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b321  // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+      "addvl x26, x26, #16\n"
       ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b521  // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151b620  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
       ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153bea1  // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151b6a1  // sdot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b9a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b9a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa0408345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bca0  // sdot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151bd21  // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 18f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "18:"  // Width 2: Multiply loop: unique 3: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 17b\n"
       "19:"  // Width 2: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+      ".inst 0xc151b320  // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b221  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
       "ble 20f\n"
       ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 20f\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b9a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b9a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 20f\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bf20  // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
       ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b521  // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 20f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      "ble 20f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153bea1  // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151bd21  // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "20:"  // Width 2: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 21f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "21:"  // Width 2: Multiply loop: unique 4: skip row sum
       "tbnz %x[flags], #31, 22f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      "neg z16.s, p2/M, z16.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "saddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "saddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z16.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "22:"  // Width 2: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z6.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z5.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z9.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xc1a5ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a4aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0ceac  // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z12.h, z12.h, z13.h\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p2, [x25]\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p1, [x25, #1, MUL VL]\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c18  // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+      ".inst 0xc1a6ac18  // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+      ".inst 0xc0062c20  // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+      ".inst 0xc1a6ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+      ".inst 0xc1a5aa38  // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a5aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+      ".inst 0xc1a9ab18  // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+      ".inst 0xc1a9ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+      ".inst 0xc1b5ce18  // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+      ".inst 0xc1b5ce00  // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "uzp1 z9.h, z26.h, z27.h\n"
+      "uzp1 z0.h, z0.h, z1.h\n"
+      "uzp1 z26.h, z2.h, z3.h\n"
+      "uzp1 z24.b, z24.b, z9.b\n"
+      "st1b { z24.b }, p2, [x25]\n"
+      "uzp1 z0.b, z0.b, z26.b\n"
+      "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
       "addvl x25, x25, #2\n"
       "23:"  // Width 2: Output done
       "b 44f\n"
       "24:"  // Width 3
       "mov x20, #0x2\n"
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "msub x20, x28, x20, %x[N]\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "msub x20, x28, x20, %x[N]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 25f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
-      ".inst 0xa041c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
-      ".inst 0xa042c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042e82  // mova za.d[x9, #2], { z20.d-z23.d }\n"
+      ".inst 0xa040c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042e00  // mova za.d[x9, #0], { z16.d-z19.d }\n"
+      ".inst 0xa041c30c  // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042d81  // mova za.d[x9, #1], { z12.d-z15.d }\n"
+      ".inst 0xa042c318  // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042f02  // mova za.d[x9, #2], { z24.d-z27.d }\n"
       "b 26f\n"
       "25:"  // Width 3: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "26:"  // Width 3: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 29f\n"
       "27:"  // Width 3: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+      ".inst 0xc151b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b221  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+      ".inst 0xc151b1a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b521  // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b622  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b5a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+      ".inst 0xa0428355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151b6a2  // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+      ".inst 0xa0408349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b920  // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
       ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153bea1  // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be22  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151ba22  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bf20  // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151bca1  // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151be22  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 28f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "28:"  // Width 3: Multiply loop: unique 5: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 27b\n"
       "29:"  // Width 3: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 30f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b521  // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+      ".inst 0xc151b2a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b221  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b622  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+      ".inst 0xc151b222  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
       "ble 30f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      "ble 30f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153bea1  // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b720  // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be22  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b622  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 30f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151ba20  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151ba21  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0428355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151baa2  // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 30f\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bda0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151be21  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151bda2  // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "30:"  // Width 3: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 31f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "31:"  // Width 3: Multiply loop: unique 6: skip row sum
       "tbnz %x[flags], #31, 32f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      "neg z16.s, p2/M, z16.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "saddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "saddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z16.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "32:"  // Width 3: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z3.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xc1a5ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
-      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
-      ".inst 0xc1a5ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a4aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
-      ".inst 0xc1a4aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a6ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0ceac  // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      ".inst 0xc1b0cea0  // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+      ".inst 0xc1a2ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+      ".inst 0xc0062c24  // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xc1a2ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+      ".inst 0xc1a1aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a1aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+      ".inst 0xc1a1aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+      ".inst 0xc1a3ab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+      ".inst 0xc1a3ab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+      ".inst 0xc1a3ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+      ".inst 0xc1a0ce08  // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+      ".inst 0xc1a0ce04  // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0xc1a0ce0c  // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+      "uzp1 z18.h, z10.h, z11.h\n"
+      "uzp1 z4.h, z4.h, z5.h\n"
+      "uzp1 z17.h, z6.h, z7.h\n"
       "uzp1 z12.h, z12.h, z13.h\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z0.h, z0.h, z1.h\n"
-      "uzp1 z1.h, z2.h, z3.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p2, [x25]\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
-      "uzp1 z0.b, z0.b, z1.b\n"
-      "st1b { z0.b }, p1, [x25, #2, MUL VL]\n"
+      "uzp1 z16.h, z14.h, z15.h\n"
+      "uzp1 z8.b, z8.b, z18.b\n"
+      "st1b { z8.b }, p2, [x25]\n"
+      "uzp1 z4.b, z4.b, z17.b\n"
+      "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+      "uzp1 z12.b, z12.b, z16.b\n"
+      "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
       "addvl x25, x25, #3\n"
       "33:"  // Width 3: Output done
       "b 44f\n"
       "34:"  // Width 4
       "mov x20, #0x3\n"
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "msub x20, x28, x20, %x[N]\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "msub x20, x28, x20, %x[N]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 35f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
-      ".inst 0xa041c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
-      ".inst 0xa042c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042e82  // mova za.d[x9, #2], { z20.d-z23.d }\n"
+      ".inst 0xa040c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042e80  // mova za.d[x9, #0], { z20.d-z23.d }\n"
+      ".inst 0xa041c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
+      ".inst 0xa042c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042e02  // mova za.d[x9, #2], { z16.d-z19.d }\n"
       ".inst 0xa043c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
       ".inst 0xc0042e03  // mova za.d[x9, #3], { z16.d-z19.d }\n"
       "addvl x24, x24, #16\n"
@@ -499,165 +497,165 @@
       "35:"  // Width 4: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "36:"  // Width 4: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 39f\n"
       "37:"  // Width 4: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+      ".inst 0xc151b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b221  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+      ".inst 0xc151b1a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
       ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b1a3  // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+      ".inst 0xc151b1a3  // sdot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b521  // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b620  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b622  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
-      ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b5a3  // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      ".inst 0xa043835d  // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153bba3  // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153bea1  // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be22  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b622  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
       ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153be23  // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b623  // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151ba20  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b9a1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151ba22  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151ba23  // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bda0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151bda1  // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+      ".inst 0xa0428359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151bf22  // sdot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+      ".inst 0xa0438345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151bca3  // sdot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 38f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "38:"  // Width 4: Multiply loop: unique 7: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 37b\n"
       "39:"  // Width 4: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b220  // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
-      ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b1a3  // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 40f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6a0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b521  // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b622  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
-      ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b5a3  // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 40f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153baa0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8a1  // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      ".inst 0xa043835d  // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153bba3  // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      "ble 40f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bea0  // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153bea1  // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be22  // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b1a0  // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+      ".inst 0xa0418359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b321  // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+      ".inst 0xa0428349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151b122  // sdot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
       ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153be23  // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b223  // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
+      "addvl x26, x26, #16\n"
+      "ble 40f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b620  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b621  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151b5a2  // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+      ".inst 0xa0438355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151b6a3  // sdot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 40f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151ba20  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151ba21  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151ba22  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151ba23  // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 40f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151be20  // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151be21  // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151be22  // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151be23  // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "40:"  // Width 4: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 41f\n"
-      "sdot z26.s, z3.b, z24.b\n"
+      "sdot z28.s, z1.b, z29.b\n"
       "41:"  // Width 4: Multiply loop: unique 8: skip row sum
       "tbnz %x[flags], #31, 42f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      "neg z16.s, p2/M, z16.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "saddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "saddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z16.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "42:"  // Width 4: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z11.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z7.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
       "ld1rw { z6.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xc1a5ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
-      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
-      ".inst 0xc1a5ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
-      ".inst 0xc0062c68  // mova { z8.d-z11.d }, za.d[x9, #3]\n"
-      ".inst 0xc1a5ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a4aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
-      ".inst 0xc1a4aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
-      ".inst 0xc1a4aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z3.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c18  // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+      ".inst 0xc1abac18  // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+      ".inst 0xc0062c30  // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+      ".inst 0xc1abac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+      ".inst 0xc0062c54  // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+      ".inst 0xc1abac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+      ".inst 0xc0062c6c  // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+      ".inst 0xc1abac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+      ".inst 0xc1a7aa38  // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+      "ld1rw { z31.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a7aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+      ".inst 0xc1a7aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+      ".inst 0xc1a6ab18  // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+      ".inst 0xc1a6ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+      ".inst 0xc1a6ab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
       ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a6ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
-      ".inst 0xc1a6ab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0ceac  // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      ".inst 0xc1b0cea0  // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0cea8  // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
+      ".inst 0xc1bfcc78  // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+      ".inst 0xc1bfcc70  // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      ".inst 0xc1bfcc74  // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+      ".inst 0xc1bfcc6c  // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "uzp1 z18.h, z18.h, z19.h\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "uzp1 z17.h, z22.h, z23.h\n"
       "uzp1 z12.h, z12.h, z13.h\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z0.h, z0.h, z1.h\n"
-      "uzp1 z1.h, z2.h, z3.h\n"
-      "uzp1 z8.h, z8.h, z9.h\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p2, [x25]\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
-      "uzp1 z0.b, z0.b, z1.b\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "st1b { z0.b }, p2, [x25, #2, MUL VL]\n"
-      "st1b { z8.b }, p1, [x25, #3, MUL VL]\n"
+      "uzp1 z30.h, z14.h, z15.h\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p2, [x25]\n"
+      "uzp1 z16.b, z16.b, z18.b\n"
+      "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+      "uzp1 z20.b, z20.b, z17.b\n"
+      "uzp1 z12.b, z12.b, z30.b\n"
+      "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+      "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
       "addvl x25, x25, #4\n"
       "43:"  // Width 4: Output done
       "subs x27, x27, #0x4\n"
@@ -665,7 +663,7 @@
       "bgt 4b\n"
       "44:"  // Exit
       ".inst 0xd503467f  // SMSTOP\n"
-      "ptrue p2.b\n"
+      "ptrue p8.b\n"
       : [N] "+&r" (N), [flags] "+&r" (flags)
       : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -674,5 +672,4 @@
 
 } // namespace arm_gemm
 
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
index e15b954..46d8c44 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,19 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "../std_transforms_sme.hpp"
 
 #define ARGLIST  \
@@ -83,4 +82,4 @@
 
 #undef ARGLIST
 
-#endif // __aarch64__
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
index dfdc4ea..093feee 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 
 #include "arm_gemm.hpp"
 #include "../../utils.hpp"
@@ -35,11 +35,9 @@
 void sme2_gemv_u8qa_dot_16VL (
     const uint8_t *A_ptr, const uint8_t *B_ptr, uint8_t *output_ptr,
     size_t N, size_t K,
-    const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
 )
 {
-    ARM_COMPUTE_UNUSED(col_base);
-
     struct KernelArgs {
         const uint8_t *B_ptr = {};
         size_t output_offset = {};
@@ -52,7 +50,7 @@
         flags |= 0x20;
     }
     __asm__ __volatile__(
-      "ptrue p2.b\n"
+      "ptrue p8.b\n"
       ".inst 0xd503477f  // SMSTART ZA\n"
       "cntw x28, ALL, MUL #4\n"
       "add x27, %x[N], x28\n"
@@ -84,8 +82,8 @@
       ".inst 0xf8b64b5a  // rprfm pldonce, x22, [x26]\n"
       "3:"  // RHS prefetch exit
       "mov x24, %x[col_bias]\n"
-      "mov z26.s, #0x0\n"
-      "mov z24.b, #0x1\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.b, #0x1\n"
       "bic %x[flags], %x[flags], #0x80000000\n"
       "4:"  // Column loop
       "cmp x27, #0x4\n"
@@ -94,404 +92,404 @@
       "bgt 24f\n"
       "beq 14f\n"
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "mov x20, %x[N]\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "mov x20, %x[N]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 5f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
+      ".inst 0xa040c300  // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042c00  // mova za.d[x9, #0], { z0.d-z3.d }\n"
       "b 6f\n"
       "5:"  // Width 1: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "6:"  // Width 1: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 9f\n"
       "7:"  // Width 1: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
       ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+      ".inst 0xc151b2b0  // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b5b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b9b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bdb0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 8f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "8:"  // Width 1: Multiply loop: unique 1: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 7b\n"
       "9:"  // Width 1: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+      ".inst 0xc151b1b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
       "ble 10f\n"
       ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
       "addvl x26, x26, #16\n"
       "ble 10f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+      ".inst 0xa0408349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b930  // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
       "addvl x26, x26, #16\n"
       "ble 10f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xa0408349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bd30  // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "10:"  // Width 1: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 11f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "11:"  // Width 1: Multiply loop: unique 2: skip row sum
       "tbnz %x[flags], #31, 12f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z26.s }, p2/Z, [x21]\n"
+      "neg z26.s, p2/M, z26.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "uaddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "uaddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z26.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "12:"  // Width 1: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z7.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p1, [x25]\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c0c  // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+      ".inst 0xc1a1ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+      "ld1rw { z30.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a2ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+      ".inst 0xc1bece0c  // sclamp { z12.s-z15.s }, z16.s, z30.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "uzp1 z19.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z19.b\n"
+      "st1b { z12.b }, p1, [x25]\n"
       "addvl x25, x25, #1\n"
       "13:"  // Width 1: Output done
       "b 44f\n"
       "14:"  // Width 2
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "sub x20, %x[N], x28\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "sub x20, %x[N], x28\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 15f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
-      ".inst 0xa041c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
+      ".inst 0xa040c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042e00  // mova za.d[x9, #0], { z16.d-z19.d }\n"
+      ".inst 0xa041c318  // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042f01  // mova za.d[x9, #1], { z24.d-z27.d }\n"
       "b 16f\n"
       "15:"  // Width 2: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "16:"  // Width 2: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 19f\n"
       "17:"  // Width 2: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b1b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+      ".inst 0xa0418359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b331  // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+      "addvl x26, x26, #16\n"
       ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b531  // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151b630  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
       ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153beb1  // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151b6b1  // udot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b9b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b9b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa0408345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bcb0  // udot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n"
+      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151bd31  // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 18f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "18:"  // Width 2: Multiply loop: unique 3: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 17b\n"
       "19:"  // Width 2: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+      ".inst 0xc151b330  // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b231  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
       "ble 20f\n"
       ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 20f\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b9b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b9b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 20f\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bf30  // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
       ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b531  // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 20f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      "ble 20f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153beb1  // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151bd31  // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "20:"  // Width 2: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 21f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "21:"  // Width 2: Multiply loop: unique 4: skip row sum
       "tbnz %x[flags], #31, 22f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      "neg z16.s, p2/M, z16.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "uaddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "uaddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z16.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "22:"  // Width 2: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z6.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z5.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z9.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xc1a5ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a4aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0ceac  // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z12.h, z12.h, z13.h\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p2, [x25]\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p1, [x25, #1, MUL VL]\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c18  // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+      ".inst 0xc1a6ac18  // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+      ".inst 0xc0062c20  // mova { z0.d-z3.d }, za.d[x9, #1]\n"
+      ".inst 0xc1a6ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+      ".inst 0xc1a5aa38  // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a5aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+      ".inst 0xc1a9ab18  // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+      ".inst 0xc1a9ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+      ".inst 0xc1b5ce18  // sclamp { z24.s-z27.s }, z16.s, z21.s\n"
+      ".inst 0xc1b5ce00  // sclamp { z0.s-z3.s }, z16.s, z21.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "uzp1 z9.h, z26.h, z27.h\n"
+      "uzp1 z0.h, z0.h, z1.h\n"
+      "uzp1 z26.h, z2.h, z3.h\n"
+      "uzp1 z24.b, z24.b, z9.b\n"
+      "st1b { z24.b }, p2, [x25]\n"
+      "uzp1 z0.b, z0.b, z26.b\n"
+      "st1b { z0.b }, p1, [x25, #1, MUL VL]\n"
       "addvl x25, x25, #2\n"
       "23:"  // Width 2: Output done
       "b 44f\n"
       "24:"  // Width 3
       "mov x20, #0x2\n"
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "msub x20, x28, x20, %x[N]\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "msub x20, x28, x20, %x[N]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 25f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
-      ".inst 0xa041c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
-      ".inst 0xa042c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042e82  // mova za.d[x9, #2], { z20.d-z23.d }\n"
+      ".inst 0xa040c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042e00  // mova za.d[x9, #0], { z16.d-z19.d }\n"
+      ".inst 0xa041c30c  // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042d81  // mova za.d[x9, #1], { z12.d-z15.d }\n"
+      ".inst 0xa042c318  // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042f02  // mova za.d[x9, #2], { z24.d-z27.d }\n"
       "b 26f\n"
       "25:"  // Width 3: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "26:"  // Width 3: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 29f\n"
       "27:"  // Width 3: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+      ".inst 0xc151b230  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b231  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+      ".inst 0xc151b1b2  // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b531  // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b632  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b5b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
+      ".inst 0xa0428355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151b6b2  // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+      ".inst 0xa0408349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b930  // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n"
       ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153beb1  // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xc151b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be32  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151ba32  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bf30  // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n"
+      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151bcb1  // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151be32  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 28f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "28:"  // Width 3: Multiply loop: unique 5: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 27b\n"
       "29:"  // Width 3: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 30f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b531  // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+      ".inst 0xc151b2b0  // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b231  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b632  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+      ".inst 0xc151b232  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
       "ble 30f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      "ble 30f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153beb1  // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+      ".inst 0xa0408359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b730  // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be32  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b632  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 30f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151ba30  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151ba31  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0428355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151bab2  // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 30f\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bdb0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151be31  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151bdb2  // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "30:"  // Width 3: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 31f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "31:"  // Width 3: Multiply loop: unique 6: skip row sum
       "tbnz %x[flags], #31, 32f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      "neg z16.s, p2/M, z16.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "uaddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "uaddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z16.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "32:"  // Width 3: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z3.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xc1a5ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
-      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
-      ".inst 0xc1a5ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a4aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
-      ".inst 0xc1a4aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a6ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0ceac  // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      ".inst 0xc1b0cea0  // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c08  // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+      ".inst 0xc1a2ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+      ".inst 0xc0062c24  // mova { z4.d-z7.d }, za.d[x9, #1]\n"
+      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc0062c4c  // mova { z12.d-z15.d }, za.d[x9, #2]\n"
+      ".inst 0xc1a2ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+      ".inst 0xc1a1aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a1aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+      ".inst 0xc1a1aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+      ".inst 0xc1a3ab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+      ".inst 0xc1a3ab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+      ".inst 0xc1a3ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+      ".inst 0xc1a0ce08  // sclamp { z8.s-z11.s }, z16.s, z0.s\n"
+      ".inst 0xc1a0ce04  // sclamp { z4.s-z7.s }, z16.s, z0.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0xc1a0ce0c  // sclamp { z12.s-z15.s }, z16.s, z0.s\n"
+      "uzp1 z18.h, z10.h, z11.h\n"
+      "uzp1 z4.h, z4.h, z5.h\n"
+      "uzp1 z17.h, z6.h, z7.h\n"
       "uzp1 z12.h, z12.h, z13.h\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z0.h, z0.h, z1.h\n"
-      "uzp1 z1.h, z2.h, z3.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p2, [x25]\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
-      "uzp1 z0.b, z0.b, z1.b\n"
-      "st1b { z0.b }, p1, [x25, #2, MUL VL]\n"
+      "uzp1 z16.h, z14.h, z15.h\n"
+      "uzp1 z8.b, z8.b, z18.b\n"
+      "st1b { z8.b }, p2, [x25]\n"
+      "uzp1 z4.b, z4.b, z17.b\n"
+      "st1b { z4.b }, p2, [x25, #1, MUL VL]\n"
+      "uzp1 z12.b, z12.b, z16.b\n"
+      "st1b { z12.b }, p1, [x25, #2, MUL VL]\n"
       "addvl x25, x25, #3\n"
       "33:"  // Width 3: Output done
       "b 44f\n"
       "34:"  // Width 4
       "mov x20, #0x3\n"
       "mov x23, %x[A_ptr]\n"
-      "mov x22, %x[K]\n"
-      "msub x20, x28, x20, %x[N]\n"
       "mov x21, %x[K]\n"
-      ".inst 0xf8b64af8  // rprfm pldmany, x22, [x23]\n"
+      "msub x20, x28, x20, %x[N]\n"
+      "mov x22, %x[K]\n"
+      ".inst 0xf8b54af8  // rprfm pldmany, x21, [x23]\n"
       "whilelt p1.b, XZR, x20\n"
       "cbz x24, 35f\n"
-      ".inst 0xa040c304  // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n"
-      ".inst 0xc0042c80  // mova za.d[x9, #0], { z4.d-z7.d }\n"
-      ".inst 0xa041c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
-      ".inst 0xc0042e81  // mova za.d[x9, #1], { z20.d-z23.d }\n"
-      ".inst 0xa042c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
-      ".inst 0xc0042e82  // mova za.d[x9, #2], { z20.d-z23.d }\n"
+      ".inst 0xa040c314  // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n"
+      ".inst 0xc0042e80  // mova za.d[x9, #0], { z20.d-z23.d }\n"
+      ".inst 0xa041c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n"
+      ".inst 0xc0042e01  // mova za.d[x9, #1], { z16.d-z19.d }\n"
+      ".inst 0xa042c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n"
+      ".inst 0xc0042e02  // mova za.d[x9, #2], { z16.d-z19.d }\n"
       ".inst 0xa043c310  // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n"
       ".inst 0xc0042e03  // mova za.d[x9, #3], { z16.d-z19.d }\n"
       "addvl x24, x24, #16\n"
@@ -499,165 +497,165 @@
       "35:"  // Width 4: no bias
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "36:"  // Width 4: setup done
-      "cmp x21, #0x10\n"
+      "cmp x22, #0x10\n"
       "ble 39f\n"
       "37:"  // Width 4: Multiply loop: Main loop head
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x23, x23, #0x10\n"
       ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+      ".inst 0xc151b230  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b231  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n"
       ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+      ".inst 0xc151b1b2  // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n"
       ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b1b3  // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+      ".inst 0xc151b1b3  // udot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n"
       "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b531  // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151b630  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b5b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n"
       ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b632  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
-      ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b5b3  // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      ".inst 0xa043835d  // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153bbb3  // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153beb1  // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be32  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b632  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n"
       ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153be33  // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b633  // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151ba30  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b9b1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151ba32  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151ba33  // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151bdb0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n"
+      ".inst 0xa041834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151bdb1  // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n"
+      ".inst 0xa0428359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151bf32  // udot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n"
+      ".inst 0xa0438345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151bcb3  // udot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "tbnz %x[flags], #31, 38f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "38:"  // Width 4: Multiply loop: unique 7: skip row sum
-      "sub x21, x21, #0x10\n"
-      "cmp x21, #0x10\n"
+      "sub x22, x22, #0x10\n"
+      "cmp x22, #0x10\n"
       "bgt 37b\n"
       "39:"  // Width 4: Multiply loop: Single iteration only
-      "whilelt p0.b, XZR, x21\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "whilelt p0.b, XZR, x22\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xa040834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0xc153b230  // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b0b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b1b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
-      ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b1b3  // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
-      "addvl x26, x26, #16\n"
-      "ble 40f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153b6b0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
-      ".inst 0xa0418349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b531  // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b632  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
-      ".inst 0xa043834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153b5b3  // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
-      "addvl x26, x26, #16\n"
-      "ble 40f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      "subs x21, x21, #0x4\n"
-      ".inst 0xc153bab0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
-      ".inst 0xa0418345  // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153b8b1  // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
-      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153b9b2  // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
-      ".inst 0xa043835d  // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153bbb3  // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
-      "addvl x26, x26, #16\n"
-      "ble 40f\n"
-      ".inst 0xa0408355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n"
-      ".inst 0xc153beb0  // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0418355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
-      ".inst 0xc153beb1  // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
-      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
-      ".inst 0xc153be32  // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b1b0  // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n"
+      ".inst 0xa0418359  // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b331  // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n"
+      ".inst 0xa0428349  // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151b132  // udot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n"
       ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
-      ".inst 0xc153be33  // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+      ".inst 0xc151b233  // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n"
+      "addvl x26, x26, #16\n"
+      "ble 40f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151b630  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151b631  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n"
+      ".inst 0xa042834d  // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151b5b2  // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n"
+      ".inst 0xa0438355  // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151b6b3  // udot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n"
+      "addvl x26, x26, #16\n"
+      "ble 40f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      "subs x22, x22, #0x4\n"
+      ".inst 0xc151ba30  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151ba31  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151ba32  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n"
+      ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151ba33  // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n"
+      "addvl x26, x26, #16\n"
+      "ble 40f\n"
+      ".inst 0xa0408351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+      ".inst 0xc151be30  // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa0418351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+      ".inst 0xc151be31  // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa0428351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+      ".inst 0xc151be32  // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n"
+      ".inst 0xa0438351  // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+      ".inst 0xc151be33  // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n"
       "addvl x26, x26, #16\n"
       "40:"  // Width 4: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 41f\n"
-      "udot z26.s, z3.b, z24.b\n"
+      "udot z28.s, z1.b, z29.b\n"
       "41:"  // Width 4: Multiply loop: unique 8: skip row sum
       "tbnz %x[flags], #31, 42f\n"
       "add x21, %x[qp], %[b_offset]\n"
       "mov x20, #0x4\n"
-      "ld1rw { z10.s }, p2/Z, [x21]\n"
-      "neg z10.s, p2/M, z10.s\n"
+      "ld1rw { z16.s }, p2/Z, [x21]\n"
+      "neg z16.s, p2/M, z16.s\n"
       "whilelt p0.s, XZR, x20\n"
-      "uaddv d26, p0, z26.s\n"
-      "mov z26.s, z26.s[0]\n"
-      "mul z26.s, p2/M, z26.s, z10.s\n"
+      "uaddv d28, p0, z28.s\n"
+      "mov z28.s, z28.s[0]\n"
+      "mul z28.s, p2/M, z28.s, z16.s\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "42:"  // Width 4: skip row sum fixup
-      ".inst 0xc0904b40  // addha za0.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b80  // addha za0.s, p2/M, p2/M, z28.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
+      "ld1rw { z11.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      ".inst 0xc0904b41  // addha za1.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      ".inst 0xc0904b81  // addha za1.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z7.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[c_offset]\n"
       "add x21, %x[qp], %[minval]\n"
-      ".inst 0xc0904b42  // addha za2.s, p2/M, p2/M, z26.s\n"
+      ".inst 0xc0904b82  // addha za2.s, p2/M, p2/M, z28.s\n"
       "ld1rw { z6.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[maxval]\n"
-      ".inst 0xc0904b43  // addha za3.s, p2/M, p2/M, z26.s\n"
-      "ld1rw { z21.s }, p2/Z, [x21]\n"
-      ".inst 0xc0062c1c  // mova { z28.d-z31.d }, za.d[x9, #0]\n"
-      ".inst 0xc1a5ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
-      ".inst 0xc0062c2c  // mova { z12.d-z15.d }, za.d[x9, #1]\n"
-      ".inst 0xc1a5ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
-      ".inst 0xc0062c40  // mova { z0.d-z3.d }, za.d[x9, #2]\n"
-      ".inst 0xc1a5ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
-      ".inst 0xc0062c68  // mova { z8.d-z11.d }, za.d[x9, #3]\n"
-      ".inst 0xc1a5ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
-      ".inst 0xc1a4aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
-      "ld1rw { z16.s }, p2/Z, [x20]\n"
-      ".inst 0xc1a4aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
-      ".inst 0xc1a4aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
-      ".inst 0xc1a4aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+      ".inst 0xc0904b83  // addha za3.s, p2/M, p2/M, z28.s\n"
+      "ld1rw { z3.s }, p2/Z, [x21]\n"
+      ".inst 0xc0062c18  // mova { z24.d-z27.d }, za.d[x9, #0]\n"
+      ".inst 0xc1abac18  // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+      ".inst 0xc0062c30  // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+      ".inst 0xc1abac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+      ".inst 0xc0062c54  // mova { z20.d-z23.d }, za.d[x9, #2]\n"
+      ".inst 0xc1abac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+      ".inst 0xc0062c6c  // mova { z12.d-z15.d }, za.d[x9, #3]\n"
+      ".inst 0xc1abac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+      ".inst 0xc1a7aa38  // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+      "ld1rw { z31.s }, p2/Z, [x20]\n"
+      ".inst 0xc1a7aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+      ".inst 0xc1a7aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+      ".inst 0xc1a6ab18  // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+      ".inst 0xc1a6ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+      ".inst 0xc1a6ab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
       ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a6ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
-      ".inst 0xc1a6ab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
-      ".inst 0xc1b0cebc  // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0ceac  // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
-      "uzp1 z28.h, z28.h, z29.h\n"
-      ".inst 0xc1b0cea0  // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
-      ".inst 0xc1b0cea8  // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
+      ".inst 0xc1bfcc78  // sclamp { z24.s-z27.s }, z3.s, z31.s\n"
+      ".inst 0xc1bfcc70  // sclamp { z16.s-z19.s }, z3.s, z31.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      ".inst 0xc1bfcc74  // sclamp { z20.s-z23.s }, z3.s, z31.s\n"
+      ".inst 0xc1bfcc6c  // sclamp { z12.s-z15.s }, z3.s, z31.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "uzp1 z18.h, z18.h, z19.h\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "uzp1 z17.h, z22.h, z23.h\n"
       "uzp1 z12.h, z12.h, z13.h\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z0.h, z0.h, z1.h\n"
-      "uzp1 z1.h, z2.h, z3.h\n"
-      "uzp1 z8.h, z8.h, z9.h\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p2, [x25]\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p2, [x25, #1, MUL VL]\n"
-      "uzp1 z0.b, z0.b, z1.b\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "st1b { z0.b }, p2, [x25, #2, MUL VL]\n"
-      "st1b { z8.b }, p1, [x25, #3, MUL VL]\n"
+      "uzp1 z30.h, z14.h, z15.h\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p2, [x25]\n"
+      "uzp1 z16.b, z16.b, z18.b\n"
+      "st1b { z16.b }, p2, [x25, #1, MUL VL]\n"
+      "uzp1 z20.b, z20.b, z17.b\n"
+      "uzp1 z12.b, z12.b, z30.b\n"
+      "st1b { z20.b }, p2, [x25, #2, MUL VL]\n"
+      "st1b { z12.b }, p1, [x25, #3, MUL VL]\n"
       "addvl x25, x25, #4\n"
       "43:"  // Width 4: Output done
       "subs x27, x27, #0x4\n"
@@ -665,7 +663,7 @@
       "bgt 4b\n"
       "44:"  // Exit
       ".inst 0xd503467f  // SMSTOP\n"
-      "ptrue p2.b\n"
+      "ptrue p8.b\n"
       : [N] "+&r" (N), [flags] "+&r" (flags)
       : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -674,5 +672,4 @@
 
 } // namespace arm_gemm
 
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif // __ARM_FEATURE_SVE
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
index 37eb63d..edfb362 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "../bfloat.hpp"
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
 
-  cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
index c6eb858..8105300 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -113,12 +112,12 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa041c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
-      ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa041c5d8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa042c5d4  // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
       ".inst 0xa043c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
       ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
       "add x12, x12, #0x4\n"
@@ -138,12 +137,12 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "fmov z21.s, #1.0\n"
+      "fmov z6.s, #1.0\n"
       ".inst 0xa009c29d  // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
-      ".inst 0x809c02a0  // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
-      ".inst 0x809d02a1  // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
-      ".inst 0x809e02a2  // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
-      ".inst 0x809f02a3  // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+      ".inst 0x809c00c0  // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+      ".inst 0x809d00c1  // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+      ".inst 0x809e00c2  // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+      ".inst 0x809f00c3  // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x9\n"
       "mov x21, x10\n"
@@ -166,75 +165,75 @@
       "madd x23, x9, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      "ld1h { z0.h }, p0/Z, [x26]\n"
-      ".inst 0xa140a6fb  // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
-      "ld1h { z13.h }, p0/Z, [x26, #1, MUL VL]\n"
-      ".inst 0xa141a6ea  // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      "ld1h { z12.h }, p0/Z, [x26, #2, MUL VL]\n"
-      ".inst 0xa142a6eb  // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      "ld1h { z26.h }, p0/Z, [x26, #3, MUL VL]\n"
+      "ld1h { z28.h }, p0/Z, [x26]\n"
+      ".inst 0xa040a6e9  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+      "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+      ".inst 0xa041a6ed  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+      ".inst 0xa042a6e5  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
       "addvl x26, x26, #4\n"
-      ".inst 0xa143a6f8  // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa143a6fb  // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0x81930000  // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+      ".inst 0x81880380  // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0x81970001  // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
-      ".inst 0x819b0002  // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
-      ".inst 0x819f0003  // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
-      "ld1h { z0.h }, p0/Z, [x26]\n"
-      ".inst 0x818201a0  // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
-      ".inst 0xa140a6fb  // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
-      ".inst 0x818601a1  // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
-      ".inst 0x818a01a2  // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
-      ".inst 0x818e01a3  // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
-      "ld1h { z13.h }, p0/Z, [x26, #1, MUL VL]\n"
-      ".inst 0x81830180  // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
-      ".inst 0xa141a6ea  // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0x81870181  // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
-      ".inst 0x818b0182  // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
-      ".inst 0x818f0183  // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
-      "ld1h { z12.h }, p0/Z, [x26, #2, MUL VL]\n"
-      ".inst 0xa142a6eb  // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      ".inst 0x81900340  // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
-      ".inst 0x81940341  // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
-      ".inst 0x81980342  // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
-      ".inst 0x819c0343  // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
-      "ld1h { z26.h }, p0/Z, [x26, #3, MUL VL]\n"
+      ".inst 0x81890381  // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+      ".inst 0x818a0382  // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+      ".inst 0x818b0383  // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+      "ld1h { z28.h }, p0/Z, [x26]\n"
+      ".inst 0x818c02c0  // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+      ".inst 0xa040a6e9  // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n"
+      ".inst 0x818d02c1  // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+      ".inst 0x818e02c2  // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+      ".inst 0x818f02c3  // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+      "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n"
+      ".inst 0x818403c0  // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+      ".inst 0xa041a6ed  // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0x818503c1  // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+      ".inst 0x818603c2  // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+      ".inst 0x818703c3  // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+      "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n"
+      ".inst 0xa042a6e5  // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      ".inst 0x81930280  // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
+      ".inst 0x81970281  // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
+      ".inst 0x819b0282  // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
+      ".inst 0x819f0283  // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
+      "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n"
       "addvl x26, x26, #4\n"
-      ".inst 0xa143a6f8  // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa143a6fb  // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0x81930000  // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
-      ".inst 0x81970001  // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
-      ".inst 0x819b0002  // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
-      ".inst 0x819f0003  // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
-      ".inst 0x818201a0  // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
-      ".inst 0x818601a1  // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
-      ".inst 0x818a01a2  // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
-      ".inst 0x818e01a3  // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
-      ".inst 0x81830180  // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
-      ".inst 0x81870181  // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
-      ".inst 0x818b0182  // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
-      ".inst 0x818f0183  // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
-      ".inst 0x81900340  // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
-      ".inst 0x81940341  // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
-      ".inst 0x81980342  // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
-      ".inst 0x819c0343  // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
+      ".inst 0x81880380  // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n"
+      ".inst 0x81890381  // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n"
+      ".inst 0x818a0382  // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n"
+      ".inst 0x818b0383  // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n"
+      ".inst 0x818c02c0  // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n"
+      ".inst 0x818d02c1  // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n"
+      ".inst 0x818e02c2  // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n"
+      ".inst 0x818f02c3  // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n"
+      ".inst 0x818403c0  // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n"
+      ".inst 0x818503c1  // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n"
+      ".inst 0x818603c2  // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n"
+      ".inst 0x818703c3  // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n"
+      ".inst 0x81930280  // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n"
+      ".inst 0x81970281  // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n"
+      ".inst 0x819b0282  // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n"
+      ".inst 0x819f0283  // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      "ld1h { z0.h }, p0/Z, [x26]\n"
+      "ld1h { z8.h }, p0/Z, [x26]\n"
       "subs x21, x21, #0x1\n"
       "addvl x26, x26, #1\n"
-      ".inst 0xa140a6f3  // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n"
+      ".inst 0xa140a6e3  // ld1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #4\n"
-      ".inst 0x81930000  // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
-      ".inst 0x81970001  // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
-      ".inst 0x819b0002  // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
-      ".inst 0x819f0003  // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+      ".inst 0x81830100  // bfmopa za0.s, p0/M, p0/M, z8.h, z3.h\n"
+      ".inst 0x81870101  // bfmopa za1.s, p0/M, p0/M, z8.h, z7.h\n"
+      ".inst 0x818b0102  // bfmopa za2.s, p0/M, p0/M, z8.h, z11.h\n"
+      ".inst 0x818f0103  // bfmopa za3.s, p0/M, p0/M, z8.h, z15.h\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x15, #1, 14f\n"
@@ -242,25 +241,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
-      ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5dc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
-      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
-      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa043c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5d4  // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
+      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5b8  // st1w { z24.s-z27.s }, pn9.b, [x13]\n"
+      ".inst 0xa060c5a8  // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
       "addvl x14, x14, #16\n"
-      ".inst 0xa061c5b4  // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
-      ".inst 0xa062c5bc  // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
-      ".inst 0xa063c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+      ".inst 0xa061c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+      ".inst 0xa062c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+      ".inst 0xa063c5a0  // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
       "addvl x13, x13, #16\n"
       "blt 11b\n"
       "b 24f\n"
@@ -268,15 +267,15 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
-      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
       ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5bc  // st1w { z28.s-z31.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+      ".inst 0xa061c5b8  // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+      ".inst 0xa062c5a8  // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
       ".inst 0xa063c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
       "addvl x13, x13, #16\n"
       "blt 13b\n"
@@ -314,18 +313,18 @@
       "16:"  // Store to output array: Skip activation: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa160c320  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa160c330  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c321  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+      ".inst 0xa160c331  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 17f\n"
-      ".inst 0xa160c322  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+      ".inst 0xa160c332  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "17:"  // Store to output array: Skip activation: Accumulator row 0 oddments: End
       "subs x24, x24, x22\n"
@@ -334,66 +333,66 @@
       "18:"  // Store to output array: Skip activation: End
       "cntw x20\n"
       "cmp x24, x20\n"
-      "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
       "csel x20, x24, x20, LT\n"
       "lsr x21, x20, #0x2\n"
-      "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+      "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
       "mov x12, #0x0\n"
       "and x20, x20, #0x3\n"
       "cbz x21, 20f\n"
       "19:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc1b0cae0  // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0cae4  // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xc1b0cae8  // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0caec  // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
-      ".inst 0xa160c320  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c834  // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c83c  // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+      ".inst 0xa160c330  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xa160c321  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+      ".inst 0xa160c331  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xa160c322  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+      ".inst 0xa160c332  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
-      ".inst 0xa160c323  // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
+      ".inst 0xa160c333  // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "blt 19b\n"
       "20:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 21f\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc1b0cae0  // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0cae4  // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xc1b0cae8  // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0caec  // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c834  // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c83c  // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c320  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+      ".inst 0xa160c330  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 21f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c321  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+      ".inst 0xa160c331  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 21f\n"
-      ".inst 0xa160c322  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+      ".inst 0xa160c332  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
       "21:"  // Store to output array: Accumulator row 0 oddments: End
       "22:"  // Store to output array: End
       "tbz x15, #0, 24f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "23:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa040c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
       ".inst 0xa041c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c5c8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
-      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa043c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x14, x14, #16\n"
@@ -417,4 +416,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
index 89c79cf..ca7b057 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "../bfloat.hpp"
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
 
-  cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
index b63f211..20c1de9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -113,14 +112,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
-      ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa040c5e4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa041c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840501  // mova za1h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa042c5f4  // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa043c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "fmov z21.s, #1.0\n"
-      ".inst 0xa00a428f  // ldnt1w { z14.s-z15.s }, p8/Z, [x20, x10, LSL #2]\n"
-      ".inst 0x808e02a0  // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
-      ".inst 0x808f02a1  // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
-      ".inst 0x808e02a2  // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
-      ".inst 0x808f02a3  // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+      "fmov z12.s, #1.0\n"
+      ".inst 0xa10a4289  // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+      ".inst 0x80810180  // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+      ".inst 0x80890181  // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+      ".inst 0x80810182  // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+      ".inst 0x80890183  // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -166,75 +165,75 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1402767  // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n"
-      ".inst 0xa14026ff  // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x23]\n"
-      ".inst 0xa0412768  // ld1h { z8.h-z9.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa04126e3  // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa1422772  // ld1h { z18.h, z26.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa04226f1  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa1432776  // ld1h { z22.h, z30.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0xa0402772  // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+      ".inst 0xa04026e3  // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+      ".inst 0xa0412764  // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa04126fb  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa042276a  // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa04226f5  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa0432766  // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14326ec  // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa04326e9  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0x819700e0  // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+      ".inst 0x81820240  // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0x819f00e1  // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
-      ".inst 0x819701e2  // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
-      ".inst 0x819f01e3  // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
-      ".inst 0xa1402767  // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n"
-      ".inst 0x81820100  // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
-      ".inst 0xa14026ff  // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x23]\n"
-      ".inst 0x81830101  // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
-      ".inst 0x81820122  // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
-      ".inst 0x81830123  // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
-      ".inst 0xa0412768  // ld1h { z8.h-z9.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0x81900240  // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
-      ".inst 0xa04126e3  // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0x81910241  // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
-      ".inst 0x81900342  // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
-      ".inst 0x81910343  // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
-      ".inst 0xa1422772  // ld1h { z18.h, z26.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa04226f1  // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0x818402c0  // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
-      ".inst 0x818c02c1  // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
-      ".inst 0x818403c2  // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
-      ".inst 0x818c03c3  // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
-      ".inst 0xa1432776  // ld1h { z22.h, z30.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0x81830241  // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+      ".inst 0x81820262  // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+      ".inst 0x81830263  // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+      ".inst 0xa0402772  // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n"
+      ".inst 0x819a0080  // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+      ".inst 0xa04026e3  // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n"
+      ".inst 0x819b0081  // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+      ".inst 0x819a00a2  // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+      ".inst 0x819b00a3  // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+      ".inst 0xa0412764  // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0x81940140  // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+      ".inst 0xa04126fb  // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0x81950141  // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+      ".inst 0x81940162  // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+      ".inst 0x81950163  // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+      ".inst 0xa042276a  // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa04226f5  // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0x818800c0  // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+      ".inst 0x818900c1  // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+      ".inst 0x818800e2  // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+      ".inst 0x818900e3  // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
+      ".inst 0xa0432766  // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14326ec  // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa04326e9  // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0x819700e0  // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
-      ".inst 0x819f00e1  // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
-      ".inst 0x819701e2  // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
-      ".inst 0x819f01e3  // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
-      ".inst 0x81820100  // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
-      ".inst 0x81830101  // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
-      ".inst 0x81820122  // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
-      ".inst 0x81830123  // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
-      ".inst 0x81900240  // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
-      ".inst 0x81910241  // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
-      ".inst 0x81900342  // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
-      ".inst 0x81910343  // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
-      ".inst 0x818402c0  // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
-      ".inst 0x818c02c1  // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
-      ".inst 0x818403c2  // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
-      ".inst 0x818c03c3  // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
+      ".inst 0x81820240  // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n"
+      ".inst 0x81830241  // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n"
+      ".inst 0x81820262  // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n"
+      ".inst 0x81830263  // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n"
+      ".inst 0x819a0080  // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n"
+      ".inst 0x819b0081  // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n"
+      ".inst 0x819a00a2  // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n"
+      ".inst 0x819b00a3  // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n"
+      ".inst 0x81940140  // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n"
+      ".inst 0x81950141  // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n"
+      ".inst 0x81940162  // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n"
+      ".inst 0x81950163  // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n"
+      ".inst 0x818800c0  // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n"
+      ".inst 0x818900c1  // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n"
+      ".inst 0x818800e2  // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n"
+      ".inst 0x818900e3  // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa1402767  // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n"
+      ".inst 0xa040277e  // ld1h { z30.h-z31.h }, pn9.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #2\n"
-      ".inst 0xa14026f7  // ld1h { z23.h, z31.h }, pn9.b/Z, [x23]\n"
+      ".inst 0xa14026e5  // ld1h { z5.h, z13.h }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #2\n"
-      ".inst 0x819700e0  // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
-      ".inst 0x819f00e1  // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
-      ".inst 0x819701e2  // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
-      ".inst 0x819f01e3  // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+      ".inst 0x818503c0  // bfmopa za0.s, p0/M, p0/M, z30.h, z5.h\n"
+      ".inst 0x818d03c1  // bfmopa za1.s, p0/M, p0/M, z30.h, z13.h\n"
+      ".inst 0x818503e2  // bfmopa za2.s, p0/M, p0/M, z31.h, z5.h\n"
+      ".inst 0x818d03e3  // bfmopa za3.s, p0/M, p0/M, z31.h, z13.h\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x16, #1, 14f\n"
@@ -243,24 +242,24 @@
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
       ".inst 0xa040c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
       ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840702  // mova za2h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa043c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840703  // mova za3h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+      ".inst 0xa060c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c5d4  // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 30f\n"
@@ -268,16 +267,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
-      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
-      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
+      ".inst 0xa061c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c5d8  // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 30f\n"
@@ -312,16 +311,16 @@
       "16:"  // Store to output array: Skip activation: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
+      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xa1604354  // st1w { z20.s, z28.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
+      ".inst 0xa1604355  // st1w { z21.s, z29.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 17f\n"
-      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
+      ".inst 0xa1604356  // st1w { z22.s, z30.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "17:"  // Store to output array: Skip activation: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -349,16 +348,16 @@
       "19:"  // Store to output array: Skip activation: Accumulator row 1 oddments
       "cbz x20, 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
-      ".inst 0xa1604354  // st1w { z20.s, z28.s }, p8, [x26]\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
+      ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604355  // st1w { z21.s, z29.s }, p8, [x26]\n"
+      ".inst 0xa1604351  // st1w { z17.s, z25.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
-      ".inst 0xa1604356  // st1w { z22.s, z30.s }, p8, [x26]\n"
+      ".inst 0xa1604352  // st1w { z18.s, z26.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "20:"  // Store to output array: Skip activation: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -367,44 +366,44 @@
       "21:"  // Store to output array: Skip activation: End
       "cntw x23\n"
       "cmp x25, x23\n"
-      "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
       "csel x22, x25, x23, LT\n"
       "lsr x21, x22, #0x2\n"
-      "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+      "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
       "mov x12, #0x0\n"
       "and x20, x22, #0x3\n"
       "cbz x21, 23f\n"
       "22:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xc1b4caa4  // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
-      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
+      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c834  // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c83c  // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+      ".inst 0xa1604354  // st1w { z20.s, z28.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
+      ".inst 0xa1604355  // st1w { z21.s, z29.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
+      ".inst 0xa1604356  // st1w { z22.s, z30.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
-      ".inst 0xa1604347  // st1w { z7.s, z15.s }, p8, [x26]\n"
+      ".inst 0xa1604357  // st1w { z23.s, z31.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "blt 22b\n"
       "23:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 24f\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4caa8  // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604340  // st1w { z0.s, z8.s }, p8, [x26]\n"
+      ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 24f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604341  // st1w { z1.s, z9.s }, p8, [x26]\n"
+      ".inst 0xa1604351  // st1w { z17.s, z25.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 24f\n"
-      ".inst 0xa1604342  // st1w { z2.s, z10.s }, p8, [x26]\n"
+      ".inst 0xa1604352  // st1w { z18.s, z26.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "24:"  // Store to output array: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -418,8 +417,8 @@
       "25:"  // Store to output array: Accumulator row 1 loop
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
       ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
-      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
       ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
@@ -435,8 +434,8 @@
       "cbz x20, 27f\n"
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
       ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
-      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
       "subs x20, x20, #0x1\n"
       ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
@@ -452,14 +451,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "29:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa040c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
       ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa043c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -483,4 +482,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
index 0d407e0..7b31d6d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "../bfloat.hpp"
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
 
-  cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
index a51b3db..70c94d3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -113,14 +112,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa041c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa042c1e0  // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa041c1e0  // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa042c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "fmov z8.s, #1.0\n"
-      "ldnt1w { z27.s }, p0/Z, [x20, x10, LSL #2]\n"
-      ".inst 0x809b2500  // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
-      ".inst 0x809b2501  // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
-      ".inst 0x809b2502  // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
-      ".inst 0x809b2503  // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+      "fmov z11.s, #1.0\n"
+      "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+      ".inst 0x808d2560  // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+      ".inst 0x808d2561  // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+      ".inst 0x808d2562  // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+      ".inst 0x808d2563  // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -166,75 +165,75 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa040a364  // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n"
-      "ldnt1h { z29.h }, p1/Z, [x23]\n"
-      ".inst 0xa041a36c  // ld1h { z12.h-z15.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      "ldnt1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa042a360  // ld1h { z0.h-z3.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa143a372  // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa140a360  // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+      "ldnt1h { z19.h }, p1/Z, [x23]\n"
+      ".inst 0xa141a371  // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa142a370  // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa143a363  // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0x819d2480  // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+      ".inst 0x81932400  // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0x819d24a1  // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
-      ".inst 0x819d24c2  // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
-      ".inst 0x819d24e3  // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
-      ".inst 0xa040a364  // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n"
-      ".inst 0x81972580  // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
-      "ldnt1h { z29.h }, p1/Z, [x23]\n"
-      ".inst 0x819725a1  // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
-      ".inst 0x819725c2  // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
-      ".inst 0x819725e3  // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
-      ".inst 0xa041a36c  // ld1h { z12.h-z15.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0x81952400  // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
-      "ldnt1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0x81952421  // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
-      ".inst 0x81952442  // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
-      ".inst 0x81952463  // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
-      ".inst 0xa042a360  // ld1h { z0.h-z3.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0x819b2640  // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
-      ".inst 0x819b26c1  // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
-      ".inst 0x819b2742  // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
-      ".inst 0x819b27c3  // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
-      ".inst 0xa143a372  // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0x81932481  // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+      ".inst 0x81932502  // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+      ".inst 0x81932583  // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+      ".inst 0xa140a360  // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n"
+      ".inst 0x81962620  // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+      "ldnt1h { z19.h }, p1/Z, [x23]\n"
+      ".inst 0x819626a1  // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+      ".inst 0x81962722  // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+      ".inst 0x819627a3  // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+      ".inst 0xa141a371  // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0x81972600  // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+      "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0x81972681  // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+      ".inst 0x81972702  // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+      ".inst 0x81972783  // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+      ".inst 0xa142a370  // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0x81822460  // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+      ".inst 0x818224e1  // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+      ".inst 0x81822562  // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+      ".inst 0x818225e3  // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
+      ".inst 0xa143a363  // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0x819d2480  // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
-      ".inst 0x819d24a1  // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
-      ".inst 0x819d24c2  // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
-      ".inst 0x819d24e3  // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
-      ".inst 0x81972580  // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
-      ".inst 0x819725a1  // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
-      ".inst 0x819725c2  // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
-      ".inst 0x819725e3  // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
-      ".inst 0x81952400  // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
-      ".inst 0x81952421  // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
-      ".inst 0x81952442  // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
-      ".inst 0x81952463  // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
-      ".inst 0x819b2640  // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
-      ".inst 0x819b26c1  // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
-      ".inst 0x819b2742  // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
-      ".inst 0x819b27c3  // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
+      ".inst 0x81932400  // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n"
+      ".inst 0x81932481  // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n"
+      ".inst 0x81932502  // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n"
+      ".inst 0x81932583  // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n"
+      ".inst 0x81962620  // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n"
+      ".inst 0x819626a1  // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n"
+      ".inst 0x81962722  // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n"
+      ".inst 0x819627a3  // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n"
+      ".inst 0x81972600  // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n"
+      ".inst 0x81972681  // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n"
+      ".inst 0x81972702  // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n"
+      ".inst 0x81972783  // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n"
+      ".inst 0x81822460  // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n"
+      ".inst 0x818224e1  // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n"
+      ".inst 0x81822562  // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n"
+      ".inst 0x818225e3  // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa040a364  // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n"
+      ".inst 0xa140a373  // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #4\n"
-      "ld1h { z29.h }, p1/Z, [x23]\n"
+      "ld1h { z11.h }, p1/Z, [x23]\n"
       "addvl x23, x23, #1\n"
-      ".inst 0x819d2480  // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
-      ".inst 0x819d24a1  // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
-      ".inst 0x819d24c2  // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
-      ".inst 0x819d24e3  // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+      ".inst 0x818b2660  // bfmopa za0.s, p1/M, p1/M, z19.h, z11.h\n"
+      ".inst 0x818b26e1  // bfmopa za1.s, p1/M, p1/M, z23.h, z11.h\n"
+      ".inst 0x818b2762  // bfmopa za2.s, p1/M, p1/M, z27.h, z11.h\n"
+      ".inst 0x818b27e3  // bfmopa za3.s, p1/M, p1/M, z31.h, z11.h\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x16, #1, 14f\n"
@@ -242,25 +241,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xa040c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
       ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
-      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
       ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xa041c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
-      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xa042c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840502  // mova za2h.s[x12], { z8.s-z11.s }\n"
-      ".inst 0xa043c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa041c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840501  // mova za1h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa042c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa043c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       ".inst 0xa060c1d8  // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
       "addvl x15, x15, #16\n"
       ".inst 0xa061c1c4  // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 42f\n"
@@ -269,15 +268,15 @@
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
       ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
       ".inst 0xa060c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
       ".inst 0xc0860440  // mova { z0.s-z3.s }, za2h.s[x12]\n"
-      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
-      ".inst 0xa061c1d8  // st1w { z24.s-z27.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      ".inst 0xa061c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       ".inst 0xa062c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa063c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 42f\n"
@@ -296,16 +295,16 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 16f\n"
       "15:"  // Store to output array: Skip activation: Accumulator row 0 loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 15b\n"
       "16:"  // Store to output array: Skip activation: Accumulator row 0 oddments
@@ -331,30 +330,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 19f\n"
       "18:"  // Store to output array: Skip activation: Accumulator row 1 loop
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      "st1w { z8.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z9.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z10.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z11.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 18b\n"
       "19:"  // Store to output array: Skip activation: Accumulator row 1 oddments
       "cbz x20, 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      "st1w { z4.s }, p0, [x26]\n"
+      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      "st1w { z24.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z5.s }, p0, [x26]\n"
+      "st1w { z25.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
-      "st1w { z6.s }, p0, [x26]\n"
+      "st1w { z26.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "20:"  // Store to output array: Skip activation: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -366,30 +365,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 22f\n"
       "21:"  // Store to output array: Skip activation: Accumulator row 2 loop
-      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 21b\n"
       "22:"  // Store to output array: Skip activation: Accumulator row 2 oddments
       "cbz x20, 23f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      "st1w { z20.s }, p0, [x26]\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      "st1w { z12.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 23f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z21.s }, p0, [x26]\n"
+      "st1w { z13.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 23f\n"
-      "st1w { z22.s }, p0, [x26]\n"
+      "st1w { z14.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "23:"  // Store to output array: Skip activation: Accumulator row 2 oddments: End
       "subs x25, x25, x22\n"
@@ -401,30 +400,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 25f\n"
       "24:"  // Store to output array: Skip activation: Accumulator row 3 loop
-      ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
-      "st1w { z4.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z5.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z6.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z7.s }, p0, [x26]\n"
+      "st1w { z19.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 24b\n"
       "25:"  // Store to output array: Skip activation: Accumulator row 3 oddments
       "cbz x20, 26f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      "st1w { z12.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 26f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z13.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 26f\n"
-      "st1w { z14.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "26:"  // Store to output array: Skip activation: Accumulator row 3 oddments: End
       "subs x25, x25, x22\n"
@@ -433,40 +432,40 @@
       "27:"  // Store to output array: Skip activation: End
       "cntw x23\n"
       "cmp x25, x23\n"
-      "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+      "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
       "csel x22, x25, x23, LT\n"
       "lsr x21, x22, #0x2\n"
-      "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+      "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
       "mov x12, #0x0\n"
       "and x20, x22, #0x3\n"
       "cbz x21, 29f\n"
       "28:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
-      ".inst 0xc1b8cb34  // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
-      "st1w { z20.s }, p0, [x26]\n"
+      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
+      ".inst 0xc1b4cabc  // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1w { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z21.s }, p0, [x26]\n"
+      "st1w { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z22.s }, p0, [x26]\n"
+      "st1w { z30.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z23.s }, p0, [x26]\n"
+      "st1w { z31.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 28b\n"
       "29:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 30f\n"
-      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb28  // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
-      "st1w { z8.s }, p0, [x26]\n"
+      ".inst 0xc1b4cabc  // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1w { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 30f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z9.s }, p0, [x26]\n"
+      "st1w { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 30f\n"
-      "st1w { z10.s }, p0, [x26]\n"
+      "st1w { z30.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "30:"  // Store to output array: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -478,24 +477,24 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 32f\n"
       "31:"  // Store to output array: Accumulator row 1 loop
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1b4caa4  // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 31b\n"
       "32:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 33f\n"
       ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 33f\n"
@@ -516,7 +515,7 @@
       "cbz x21, 35f\n"
       "34:"  // Store to output array: Accumulator row 2 loop
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "st1w { z17.s }, p0, [x26]\n"
@@ -532,7 +531,7 @@
       "cbz x20, 36f\n"
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 36f\n"
@@ -552,24 +551,24 @@
       "and x20, x20, #0x3\n"
       "cbz x21, 38f\n"
       "37:"  // Store to output array: Accumulator row 3 loop
-      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
-      ".inst 0xc1b8cb34  // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
-      "st1w { z20.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z21.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z22.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z23.s }, p0, [x26]\n"
+      "st1w { z19.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 37b\n"
       "38:"  // Store to output array: Accumulator row 3 oddments
       "cbz x20, 39f\n"
       ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 39f\n"
@@ -588,10 +587,10 @@
       ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa041c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -615,4 +614,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
index 7777349..bf3de21 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 1, 4, 1> transforms = {};
 
-  cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
index dd99387..97be758 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -113,12 +112,12 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa041c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
-      ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n"
+      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa041c5d8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa042c5d4  // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
       ".inst 0xa043c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
       ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
       "add x12, x12, #0x4\n"
@@ -138,12 +137,12 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "fmov z21.s, #1.0\n"
+      "fmov z6.s, #1.0\n"
       ".inst 0xa009c29d  // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n"
-      ".inst 0x809c02a0  // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
-      ".inst 0x809d02a1  // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
-      ".inst 0x809e02a2  // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
-      ".inst 0x809f02a3  // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+      ".inst 0x809c00c0  // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n"
+      ".inst 0x809d00c1  // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n"
+      ".inst 0x809e00c2  // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n"
+      ".inst 0x809f00c3  // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x9\n"
       "mov x21, x10\n"
@@ -164,75 +163,75 @@
       "madd x21, x9, x20, x21\n"  // bptr = B + n * kstride_bytes
       "cbz x23, 8f\n"
       "subs x23, x23, #0x1\n"
-      "ld1w { z0.s }, p0/Z, [x26]\n"
-      ".inst 0xa140c6bb  // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n"
-      "ld1w { z13.s }, p0/Z, [x26, #1, MUL VL]\n"
-      ".inst 0xa141c6aa  // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
-      "ld1w { z12.s }, p0/Z, [x26, #2, MUL VL]\n"
-      ".inst 0xa142c6ab  // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
-      "ld1w { z26.s }, p0/Z, [x26, #3, MUL VL]\n"
+      "ld1w { z28.s }, p0/Z, [x26]\n"
+      ".inst 0xa040c6a9  // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+      "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+      ".inst 0xa041c6ad  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+      "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+      ".inst 0xa042c6a5  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+      "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
       "addvl x26, x26, #4\n"
-      ".inst 0xa143c6b8  // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+      ".inst 0xa143c6bb  // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
       "addvl x21, x21, #16\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0x80930000  // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+      ".inst 0x80880380  // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
       "subs x23, x23, #0x1\n"
-      ".inst 0x80970001  // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
-      ".inst 0x809b0002  // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
-      ".inst 0x809f0003  // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
-      "ld1w { z0.s }, p0/Z, [x26]\n"
-      ".inst 0x808201a0  // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
-      ".inst 0xa140c6bb  // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n"
-      ".inst 0x808601a1  // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
-      ".inst 0x808a01a2  // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
-      ".inst 0x808e01a3  // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
-      "ld1w { z13.s }, p0/Z, [x26, #1, MUL VL]\n"
-      ".inst 0x80830180  // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
-      ".inst 0xa141c6aa  // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
-      ".inst 0x80870181  // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
-      ".inst 0x808b0182  // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
-      ".inst 0x808f0183  // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
-      "ld1w { z12.s }, p0/Z, [x26, #2, MUL VL]\n"
-      ".inst 0xa142c6ab  // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
-      ".inst 0x80900340  // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
-      ".inst 0x80940341  // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
-      ".inst 0x80980342  // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
-      ".inst 0x809c0343  // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
-      "ld1w { z26.s }, p0/Z, [x26, #3, MUL VL]\n"
+      ".inst 0x80890381  // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+      ".inst 0x808a0382  // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+      ".inst 0x808b0383  // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+      "ld1w { z28.s }, p0/Z, [x26]\n"
+      ".inst 0x808c02c0  // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+      ".inst 0xa040c6a9  // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n"
+      ".inst 0x808d02c1  // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+      ".inst 0x808e02c2  // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+      ".inst 0x808f02c3  // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+      "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n"
+      ".inst 0x808403c0  // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+      ".inst 0xa041c6ad  // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+      ".inst 0x808503c1  // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+      ".inst 0x808603c2  // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+      ".inst 0x808703c3  // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+      "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n"
+      ".inst 0xa042c6a5  // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n"
+      ".inst 0x80930280  // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
+      ".inst 0x80970281  // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
+      ".inst 0x809b0282  // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
+      ".inst 0x809f0283  // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
+      "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n"
       "addvl x26, x26, #4\n"
-      ".inst 0xa143c6b8  // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
+      ".inst 0xa143c6bb  // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n"
       "addvl x21, x21, #16\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0x80930000  // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
-      ".inst 0x80970001  // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
-      ".inst 0x809b0002  // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
-      ".inst 0x809f0003  // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
-      ".inst 0x808201a0  // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
-      ".inst 0x808601a1  // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
-      ".inst 0x808a01a2  // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
-      ".inst 0x808e01a3  // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
-      ".inst 0x80830180  // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
-      ".inst 0x80870181  // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
-      ".inst 0x808b0182  // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
-      ".inst 0x808f0183  // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
-      ".inst 0x80900340  // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
-      ".inst 0x80940341  // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
-      ".inst 0x80980342  // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
-      ".inst 0x809c0343  // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
+      ".inst 0x80880380  // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n"
+      ".inst 0x80890381  // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n"
+      ".inst 0x808a0382  // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n"
+      ".inst 0x808b0383  // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n"
+      ".inst 0x808c02c0  // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n"
+      ".inst 0x808d02c1  // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n"
+      ".inst 0x808e02c2  // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n"
+      ".inst 0x808f02c3  // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n"
+      ".inst 0x808403c0  // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n"
+      ".inst 0x808503c1  // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n"
+      ".inst 0x808603c2  // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n"
+      ".inst 0x808703c3  // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n"
+      ".inst 0x80930280  // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n"
+      ".inst 0x80970281  // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n"
+      ".inst 0x809b0282  // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n"
+      ".inst 0x809f0283  // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n"
       "8:"  // K oddments
       "cbz x22, 10f\n"
       "9:"  // K oddments: Loop
-      "ld1w { z0.s }, p0/Z, [x26]\n"
+      "ld1w { z8.s }, p0/Z, [x26]\n"
       "subs x22, x22, #0x1\n"
       "addvl x26, x26, #1\n"
-      ".inst 0xa140c6b3  // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n"
+      ".inst 0xa140c6a3  // ld1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21]\n"
       "addvl x21, x21, #4\n"
-      ".inst 0x80930000  // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
-      ".inst 0x80970001  // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
-      ".inst 0x809b0002  // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
-      ".inst 0x809f0003  // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+      ".inst 0x80830100  // fmopa za0.s, p0/M, p0/M, z8.s, z3.s\n"
+      ".inst 0x80870101  // fmopa za1.s, p0/M, p0/M, z8.s, z7.s\n"
+      ".inst 0x808b0102  // fmopa za2.s, p0/M, p0/M, z8.s, z11.s\n"
+      ".inst 0x808f0103  // fmopa za3.s, p0/M, p0/M, z8.s, z15.s\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x15, #1, 14f\n"
@@ -240,25 +239,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
-      ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5dc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
-      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
-      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa043c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5d4  // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n"
+      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5b8  // st1w { z24.s-z27.s }, pn9.b, [x13]\n"
+      ".inst 0xa060c5a8  // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
       "addvl x14, x14, #16\n"
-      ".inst 0xa061c5b4  // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
-      ".inst 0xa062c5bc  // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
-      ".inst 0xa063c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+      ".inst 0xa061c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+      ".inst 0xa062c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+      ".inst 0xa063c5a0  // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
       "addvl x13, x13, #16\n"
       "blt 11b\n"
       "b 24f\n"
@@ -266,15 +265,15 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
-      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
       ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5bc  // st1w { z28.s-z31.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+      ".inst 0xa061c5b8  // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+      ".inst 0xa062c5a8  // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
       ".inst 0xa063c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
       "addvl x13, x13, #16\n"
       "blt 13b\n"
@@ -312,18 +311,18 @@
       "16:"  // Store to output array: Skip activation: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa160c320  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa160c330  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c321  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+      ".inst 0xa160c331  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 17f\n"
-      ".inst 0xa160c322  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+      ".inst 0xa160c332  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "17:"  // Store to output array: Skip activation: Accumulator row 0 oddments: End
       "subs x24, x24, x22\n"
@@ -332,66 +331,66 @@
       "18:"  // Store to output array: Skip activation: End
       "cntw x20\n"
       "cmp x24, x20\n"
-      "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
       "csel x20, x24, x20, LT\n"
       "lsr x21, x20, #0x2\n"
-      "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+      "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
       "mov x12, #0x0\n"
       "and x20, x20, #0x3\n"
       "cbz x21, 20f\n"
       "19:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc1b0cae0  // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0cae4  // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xc1b0cae8  // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0caec  // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
-      ".inst 0xa160c320  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c834  // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c83c  // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+      ".inst 0xa160c330  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xa160c321  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+      ".inst 0xa160c331  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xa160c322  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+      ".inst 0xa160c332  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
-      ".inst 0xa160c323  // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n"
+      ".inst 0xa160c333  // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "blt 19b\n"
       "20:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 21f\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc1b0cae0  // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0cae4  // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xc1b0cae8  // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
-      ".inst 0xc1b0caec  // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c834  // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c83c  // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c320  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n"
+      ".inst 0xa160c330  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 21f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c321  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n"
+      ".inst 0xa160c331  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n"
       "add x25, x25, x23\n"
       "beq 21f\n"
-      ".inst 0xa160c322  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n"
+      ".inst 0xa160c332  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n"
       "21:"  // Store to output array: Accumulator row 0 oddments: End
       "22:"  // Store to output array: End
       "tbz x15, #0, 24f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "23:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa040c5cc  // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
       ".inst 0xa041c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c5d0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c5c8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
-      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa043c5c4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x14, x14, #16\n"
@@ -415,4 +414,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
index 51e8c43..9bc1f83 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 2, 2, 1> transforms = {};
 
-  cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
index 87d7827..3c47504 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -113,14 +112,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
-      ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa040c5e4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa041c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840501  // mova za1h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa042c5f4  // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa043c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "fmov z21.s, #1.0\n"
-      ".inst 0xa00a428f  // ldnt1w { z14.s-z15.s }, p8/Z, [x20, x10, LSL #2]\n"
-      ".inst 0x808e02a0  // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
-      ".inst 0x808f02a1  // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
-      ".inst 0x808e02a2  // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
-      ".inst 0x808f02a3  // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+      "fmov z12.s, #1.0\n"
+      ".inst 0xa10a4289  // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n"
+      ".inst 0x80810180  // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n"
+      ".inst 0x80890181  // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n"
+      ".inst 0x80810182  // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n"
+      ".inst 0x80890183  // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -164,75 +163,75 @@
       "madd x21, x10, x20, x21\n"  // bptr = B + n * kstride_bytes
       "cbz x23, 8f\n"
       "subs x23, x23, #0x1\n"
-      ".inst 0xa1404767  // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
-      ".inst 0xa14046bf  // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x21]\n"
-      ".inst 0xa0414768  // ld1w { z8.s-z9.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa04146a3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
-      ".inst 0xa1424772  // ld1w { z18.s, z26.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa04246b1  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
-      ".inst 0xa1434776  // ld1w { z22.s, z30.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0xa0404772  // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+      ".inst 0xa04046a3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+      ".inst 0xa0414764  // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa04146bb  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+      ".inst 0xa042476a  // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa04246b5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+      ".inst 0xa0434766  // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14346ac  // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+      ".inst 0xa04346a9  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
       "addvl x21, x21, #8\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0x809700e0  // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+      ".inst 0x80820240  // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
       "subs x23, x23, #0x1\n"
-      ".inst 0x809f00e1  // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
-      ".inst 0x809701e2  // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
-      ".inst 0x809f01e3  // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
-      ".inst 0xa1404767  // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
-      ".inst 0x80820100  // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
-      ".inst 0xa14046bf  // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x21]\n"
-      ".inst 0x80830101  // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
-      ".inst 0x80820122  // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
-      ".inst 0x80830123  // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
-      ".inst 0xa0414768  // ld1w { z8.s-z9.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0x80900240  // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
-      ".inst 0xa04146a3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
-      ".inst 0x80910241  // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
-      ".inst 0x80900342  // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
-      ".inst 0x80910343  // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
-      ".inst 0xa1424772  // ld1w { z18.s, z26.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa04246b1  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
-      ".inst 0x808402c0  // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
-      ".inst 0x808c02c1  // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
-      ".inst 0x808403c2  // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
-      ".inst 0x808c03c3  // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
-      ".inst 0xa1434776  // ld1w { z22.s, z30.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0x80830241  // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+      ".inst 0x80820262  // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+      ".inst 0x80830263  // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+      ".inst 0xa0404772  // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n"
+      ".inst 0x809a0080  // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+      ".inst 0xa04046a3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n"
+      ".inst 0x809b0081  // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+      ".inst 0x809a00a2  // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+      ".inst 0x809b00a3  // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+      ".inst 0xa0414764  // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0x80940140  // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+      ".inst 0xa04146bb  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n"
+      ".inst 0x80950141  // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+      ".inst 0x80940162  // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+      ".inst 0x80950163  // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+      ".inst 0xa042476a  // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa04246b5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n"
+      ".inst 0x808800c0  // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+      ".inst 0x808900c1  // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+      ".inst 0x808800e2  // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+      ".inst 0x808900e3  // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
+      ".inst 0xa0434766  // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14346ac  // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
+      ".inst 0xa04346a9  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n"
       "addvl x21, x21, #8\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0x809700e0  // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
-      ".inst 0x809f00e1  // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
-      ".inst 0x809701e2  // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
-      ".inst 0x809f01e3  // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
-      ".inst 0x80820100  // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
-      ".inst 0x80830101  // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
-      ".inst 0x80820122  // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
-      ".inst 0x80830123  // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
-      ".inst 0x80900240  // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
-      ".inst 0x80910241  // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
-      ".inst 0x80900342  // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
-      ".inst 0x80910343  // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
-      ".inst 0x808402c0  // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
-      ".inst 0x808c02c1  // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
-      ".inst 0x808403c2  // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
-      ".inst 0x808c03c3  // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
+      ".inst 0x80820240  // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n"
+      ".inst 0x80830241  // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n"
+      ".inst 0x80820262  // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n"
+      ".inst 0x80830263  // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n"
+      ".inst 0x809a0080  // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n"
+      ".inst 0x809b0081  // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n"
+      ".inst 0x809a00a2  // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n"
+      ".inst 0x809b00a3  // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n"
+      ".inst 0x80940140  // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n"
+      ".inst 0x80950141  // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n"
+      ".inst 0x80940162  // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n"
+      ".inst 0x80950163  // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n"
+      ".inst 0x808800c0  // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n"
+      ".inst 0x808900c1  // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n"
+      ".inst 0x808800e2  // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n"
+      ".inst 0x808900e3  // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n"
       "8:"  // K oddments
       "cbz x22, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa1404767  // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n"
+      ".inst 0xa040477e  // ld1w { z30.s-z31.s }, pn9.b/Z, [x27]\n"
       "subs x22, x22, #0x1\n"
       "addvl x27, x27, #2\n"
-      ".inst 0xa14046b7  // ld1w { z23.s, z31.s }, pn9.b/Z, [x21]\n"
+      ".inst 0xa14046a5  // ld1w { z5.s, z13.s }, pn9.b/Z, [x21]\n"
       "addvl x21, x21, #2\n"
-      ".inst 0x809700e0  // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
-      ".inst 0x809f00e1  // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
-      ".inst 0x809701e2  // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
-      ".inst 0x809f01e3  // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+      ".inst 0x808503c0  // fmopa za0.s, p0/M, p0/M, z30.s, z5.s\n"
+      ".inst 0x808d03c1  // fmopa za1.s, p0/M, p0/M, z30.s, z13.s\n"
+      ".inst 0x808503e2  // fmopa za2.s, p0/M, p0/M, z31.s, z5.s\n"
+      ".inst 0x808d03e3  // fmopa za3.s, p0/M, p0/M, z31.s, z13.s\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x16, #1, 14f\n"
@@ -241,24 +240,24 @@
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
       ".inst 0xa040c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
       ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840702  // mova za2h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa043c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840703  // mova za3h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+      ".inst 0xa060c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c5d4  // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 30f\n"
@@ -266,16 +265,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
-      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
-      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
+      ".inst 0xa061c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c5d8  // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 30f\n"
@@ -310,16 +309,16 @@
       "16:"  // Store to output array: Skip activation: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
+      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xa1604354  // st1w { z20.s, z28.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
+      ".inst 0xa1604355  // st1w { z21.s, z29.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 17f\n"
-      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
+      ".inst 0xa1604356  // st1w { z22.s, z30.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "17:"  // Store to output array: Skip activation: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -347,16 +346,16 @@
       "19:"  // Store to output array: Skip activation: Accumulator row 1 oddments
       "cbz x20, 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
-      ".inst 0xa1604354  // st1w { z20.s, z28.s }, p8, [x26]\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
+      ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604355  // st1w { z21.s, z29.s }, p8, [x26]\n"
+      ".inst 0xa1604351  // st1w { z17.s, z25.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
-      ".inst 0xa1604356  // st1w { z22.s, z30.s }, p8, [x26]\n"
+      ".inst 0xa1604352  // st1w { z18.s, z26.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "20:"  // Store to output array: Skip activation: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -365,44 +364,44 @@
       "21:"  // Store to output array: Skip activation: End
       "cntw x23\n"
       "cmp x25, x23\n"
-      "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
       "csel x22, x25, x23, LT\n"
       "lsr x21, x22, #0x2\n"
-      "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+      "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
       "mov x12, #0x0\n"
       "and x20, x22, #0x3\n"
       "cbz x21, 23f\n"
       "22:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xc1b4caa4  // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
-      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
+      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c834  // fclamp { z20.s-z23.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c83c  // fclamp { z28.s-z31.s }, z1.s, z0.s\n"
+      ".inst 0xa1604354  // st1w { z20.s, z28.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
+      ".inst 0xa1604355  // st1w { z21.s, z29.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
+      ".inst 0xa1604356  // st1w { z22.s, z30.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
-      ".inst 0xa1604347  // st1w { z7.s, z15.s }, p8, [x26]\n"
+      ".inst 0xa1604357  // st1w { z23.s, z31.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "blt 22b\n"
       "23:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 24f\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4caa8  // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604340  // st1w { z0.s, z8.s }, p8, [x26]\n"
+      ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 24f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604341  // st1w { z1.s, z9.s }, p8, [x26]\n"
+      ".inst 0xa1604351  // st1w { z17.s, z25.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "beq 24f\n"
-      ".inst 0xa1604342  // st1w { z2.s, z10.s }, p8, [x26]\n"
+      ".inst 0xa1604352  // st1w { z18.s, z26.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "24:"  // Store to output array: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -416,8 +415,8 @@
       "25:"  // Store to output array: Accumulator row 1 loop
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
       ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
-      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
       ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
@@ -433,8 +432,8 @@
       "cbz x20, 27f\n"
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
       ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
-      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4cab8  // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc1a0c830  // fclamp { z16.s-z19.s }, z1.s, z0.s\n"
+      ".inst 0xc1a0c838  // fclamp { z24.s-z27.s }, z1.s, z0.s\n"
       "subs x20, x20, #0x1\n"
       ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
       "add x26, x26, x24\n"
@@ -450,14 +449,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "29:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa040c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
       ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa043c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -481,4 +480,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
index a315ebb..165e25d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 4, 1, 1> transforms = {};
 
-  cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
index 291a7ce..ae1f812 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -113,14 +112,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa041c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa042c1e0  // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa041c1e0  // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa042c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -138,12 +137,12 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "fmov z8.s, #1.0\n"
-      "ldnt1w { z27.s }, p0/Z, [x20, x10, LSL #2]\n"
-      ".inst 0x809b2500  // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
-      ".inst 0x809b2501  // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
-      ".inst 0x809b2502  // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
-      ".inst 0x809b2503  // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+      "fmov z11.s, #1.0\n"
+      "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n"
+      ".inst 0x808d2560  // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n"
+      ".inst 0x808d2561  // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n"
+      ".inst 0x808d2562  // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n"
+      ".inst 0x808d2563  // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -164,75 +163,75 @@
       "madd x21, x10, x20, x21\n"  // bptr = B + n * kstride_bytes
       "cbz x23, 8f\n"
       "subs x23, x23, #0x1\n"
-      ".inst 0xa040c364  // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n"
-      "ldnt1w { z29.s }, p1/Z, [x21]\n"
-      ".inst 0xa041c36c  // ld1w { z12.s-z15.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      "ldnt1w { z23.s }, p1/Z, [x21, #1, MUL VL]\n"
-      ".inst 0xa042c360  // ld1w { z0.s-z3.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1w { z21.s }, p1/Z, [x21, #2, MUL VL]\n"
-      ".inst 0xa143c372  // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa140c360  // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+      "ldnt1w { z19.s }, p1/Z, [x21]\n"
+      ".inst 0xa141c371  // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+      ".inst 0xa142c370  // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+      ".inst 0xa143c363  // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
       "addvl x21, x21, #4\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0x809d2480  // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+      ".inst 0x80932400  // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
       "subs x23, x23, #0x1\n"
-      ".inst 0x809d24a1  // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
-      ".inst 0x809d24c2  // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
-      ".inst 0x809d24e3  // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
-      ".inst 0xa040c364  // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n"
-      ".inst 0x80972580  // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
-      "ldnt1w { z29.s }, p1/Z, [x21]\n"
-      ".inst 0x809725a1  // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
-      ".inst 0x809725c2  // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
-      ".inst 0x809725e3  // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
-      ".inst 0xa041c36c  // ld1w { z12.s-z15.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0x80952400  // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
-      "ldnt1w { z23.s }, p1/Z, [x21, #1, MUL VL]\n"
-      ".inst 0x80952421  // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
-      ".inst 0x80952442  // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
-      ".inst 0x80952463  // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
-      ".inst 0xa042c360  // ld1w { z0.s-z3.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1w { z21.s }, p1/Z, [x21, #2, MUL VL]\n"
-      ".inst 0x809b2640  // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
-      ".inst 0x809b26c1  // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
-      ".inst 0x809b2742  // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
-      ".inst 0x809b27c3  // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
-      ".inst 0xa143c372  // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0x80932481  // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+      ".inst 0x80932502  // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+      ".inst 0x80932583  // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+      ".inst 0xa140c360  // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n"
+      ".inst 0x80962620  // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+      "ldnt1w { z19.s }, p1/Z, [x21]\n"
+      ".inst 0x809626a1  // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+      ".inst 0x80962722  // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+      ".inst 0x809627a3  // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+      ".inst 0xa141c371  // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0x80972600  // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+      "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n"
+      ".inst 0x80972681  // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+      ".inst 0x80972702  // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+      ".inst 0x80972783  // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+      ".inst 0xa142c370  // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n"
+      ".inst 0x80822460  // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+      ".inst 0x808224e1  // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+      ".inst 0x80822562  // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+      ".inst 0x808225e3  // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
+      ".inst 0xa143c363  // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n"
       "addvl x21, x21, #4\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0x809d2480  // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
-      ".inst 0x809d24a1  // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
-      ".inst 0x809d24c2  // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
-      ".inst 0x809d24e3  // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
-      ".inst 0x80972580  // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
-      ".inst 0x809725a1  // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
-      ".inst 0x809725c2  // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
-      ".inst 0x809725e3  // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
-      ".inst 0x80952400  // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
-      ".inst 0x80952421  // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
-      ".inst 0x80952442  // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
-      ".inst 0x80952463  // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
-      ".inst 0x809b2640  // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
-      ".inst 0x809b26c1  // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
-      ".inst 0x809b2742  // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
-      ".inst 0x809b27c3  // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
+      ".inst 0x80932400  // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n"
+      ".inst 0x80932481  // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n"
+      ".inst 0x80932502  // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n"
+      ".inst 0x80932583  // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n"
+      ".inst 0x80962620  // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n"
+      ".inst 0x809626a1  // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n"
+      ".inst 0x80962722  // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n"
+      ".inst 0x809627a3  // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n"
+      ".inst 0x80972600  // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n"
+      ".inst 0x80972681  // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n"
+      ".inst 0x80972702  // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n"
+      ".inst 0x80972783  // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n"
+      ".inst 0x80822460  // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n"
+      ".inst 0x808224e1  // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n"
+      ".inst 0x80822562  // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n"
+      ".inst 0x808225e3  // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n"
       "8:"  // K oddments
       "cbz x22, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa040c364  // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n"
+      ".inst 0xa140c373  // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27]\n"
       "subs x22, x22, #0x1\n"
       "addvl x27, x27, #4\n"
-      "ld1w { z29.s }, p1/Z, [x21]\n"
+      "ld1w { z11.s }, p1/Z, [x21]\n"
       "addvl x21, x21, #1\n"
-      ".inst 0x809d2480  // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
-      ".inst 0x809d24a1  // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
-      ".inst 0x809d24c2  // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
-      ".inst 0x809d24e3  // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+      ".inst 0x808b2660  // fmopa za0.s, p1/M, p1/M, z19.s, z11.s\n"
+      ".inst 0x808b26e1  // fmopa za1.s, p1/M, p1/M, z23.s, z11.s\n"
+      ".inst 0x808b2762  // fmopa za2.s, p1/M, p1/M, z27.s, z11.s\n"
+      ".inst 0x808b27e3  // fmopa za3.s, p1/M, p1/M, z31.s, z11.s\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x16, #1, 14f\n"
@@ -240,25 +239,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xa040c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
       ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
-      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
       ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xa041c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
-      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xa042c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840502  // mova za2h.s[x12], { z8.s-z11.s }\n"
-      ".inst 0xa043c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa041c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840501  // mova za1h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa042c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa043c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       ".inst 0xa060c1d8  // st1w { z24.s-z27.s }, pn8.b, [x14]\n"
       "addvl x15, x15, #16\n"
       ".inst 0xa061c1c4  // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 42f\n"
@@ -267,15 +266,15 @@
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
       ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
       ".inst 0xa060c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14]\n"
       ".inst 0xc0860440  // mova { z0.s-z3.s }, za2h.s[x12]\n"
-      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
-      ".inst 0xa061c1d8  // st1w { z24.s-z27.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      ".inst 0xa061c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       ".inst 0xa062c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa063c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 42f\n"
@@ -294,16 +293,16 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 16f\n"
       "15:"  // Store to output array: Skip activation: Accumulator row 0 loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 15b\n"
       "16:"  // Store to output array: Skip activation: Accumulator row 0 oddments
@@ -329,30 +328,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 19f\n"
       "18:"  // Store to output array: Skip activation: Accumulator row 1 loop
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      "st1w { z8.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z9.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z10.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z11.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 18b\n"
       "19:"  // Store to output array: Skip activation: Accumulator row 1 oddments
       "cbz x20, 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      "st1w { z4.s }, p0, [x26]\n"
+      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
+      "st1w { z24.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z5.s }, p0, [x26]\n"
+      "st1w { z25.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 20f\n"
-      "st1w { z6.s }, p0, [x26]\n"
+      "st1w { z26.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "20:"  // Store to output array: Skip activation: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -364,30 +363,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 22f\n"
       "21:"  // Store to output array: Skip activation: Accumulator row 2 loop
-      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 21b\n"
       "22:"  // Store to output array: Skip activation: Accumulator row 2 oddments
       "cbz x20, 23f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      "st1w { z20.s }, p0, [x26]\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      "st1w { z12.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 23f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z21.s }, p0, [x26]\n"
+      "st1w { z13.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 23f\n"
-      "st1w { z22.s }, p0, [x26]\n"
+      "st1w { z14.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "23:"  // Store to output array: Skip activation: Accumulator row 2 oddments: End
       "subs x25, x25, x22\n"
@@ -399,30 +398,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 25f\n"
       "24:"  // Store to output array: Skip activation: Accumulator row 3 loop
-      ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
-      "st1w { z4.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z5.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z6.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z7.s }, p0, [x26]\n"
+      "st1w { z19.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 24b\n"
       "25:"  // Store to output array: Skip activation: Accumulator row 3 oddments
       "cbz x20, 26f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      "st1w { z12.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 26f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z13.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 26f\n"
-      "st1w { z14.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "26:"  // Store to output array: Skip activation: Accumulator row 3 oddments: End
       "subs x25, x25, x22\n"
@@ -431,40 +430,40 @@
       "27:"  // Store to output array: Skip activation: End
       "cntw x23\n"
       "cmp x25, x23\n"
-      "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+      "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
       "csel x22, x25, x23, LT\n"
       "lsr x21, x22, #0x2\n"
-      "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+      "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
       "mov x12, #0x0\n"
       "and x20, x22, #0x3\n"
       "cbz x21, 29f\n"
       "28:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
-      ".inst 0xc1b8cb34  // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
-      "st1w { z20.s }, p0, [x26]\n"
+      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
+      ".inst 0xc1b4cabc  // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1w { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z21.s }, p0, [x26]\n"
+      "st1w { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z22.s }, p0, [x26]\n"
+      "st1w { z30.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z23.s }, p0, [x26]\n"
+      "st1w { z31.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 28b\n"
       "29:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 30f\n"
-      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb28  // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
-      "st1w { z8.s }, p0, [x26]\n"
+      ".inst 0xc1b4cabc  // fclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1w { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 30f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z9.s }, p0, [x26]\n"
+      "st1w { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 30f\n"
-      "st1w { z10.s }, p0, [x26]\n"
+      "st1w { z30.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "30:"  // Store to output array: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -476,24 +475,24 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 32f\n"
       "31:"  // Store to output array: Accumulator row 1 loop
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1b4caa4  // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 31b\n"
       "32:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 33f\n"
       ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 33f\n"
@@ -514,7 +513,7 @@
       "cbz x21, 35f\n"
       "34:"  // Store to output array: Accumulator row 2 loop
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "st1w { z17.s }, p0, [x26]\n"
@@ -530,7 +529,7 @@
       "cbz x20, 36f\n"
       ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 36f\n"
@@ -550,24 +549,24 @@
       "and x20, x20, #0x3\n"
       "cbz x21, 38f\n"
       "37:"  // Store to output array: Accumulator row 3 loop
-      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
-      ".inst 0xc1b8cb34  // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
-      "st1w { z20.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1w { z21.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "add x12, x12, #0x4\n"
-      "st1w { z22.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z23.s }, p0, [x26]\n"
+      "st1w { z19.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 37b\n"
       "38:"  // Store to output array: Accumulator row 3 oddments
       "cbz x20, 39f\n"
       ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1b8cb30  // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+      ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 39f\n"
@@ -586,10 +585,10 @@
       ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa041c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -613,4 +612,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
index b8bcd53..7b3cc77 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
 
-  cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
index 929af04..aba677b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -100,14 +99,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa041c5bc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
-      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa042c5bc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
-      ".inst 0xc0840782  // mova za2h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa043c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa041c5a8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+      ".inst 0xc0840501  // mova za1h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa042c5a8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+      ".inst 0xc0840502  // mova za2h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa043c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x13, x13, #16\n"
@@ -125,11 +124,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      ".inst 0xa01cc299  // ldnt1w { z24.s-z27.s }, p8/Z, [x20, x28, LSL #2]\n"
-      ".inst 0xc0902700  // addha za0.s, p1/M, p1/M, z24.s\n"
-      ".inst 0xc0902721  // addha za1.s, p1/M, p1/M, z25.s\n"
-      ".inst 0xc0902742  // addha za2.s, p1/M, p1/M, z26.s\n"
-      ".inst 0xc0902763  // addha za3.s, p1/M, p1/M, z27.s\n"
+      ".inst 0xa11cc289  // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+      ".inst 0xc0902420  // addha za0.s, p1/M, p1/M, z1.s\n"
+      ".inst 0xc09024a1  // addha za1.s, p1/M, p1/M, z5.s\n"
+      ".inst 0xc0902522  // addha za2.s, p1/M, p1/M, z9.s\n"
+      ".inst 0xc09025a3  // addha za3.s, p1/M, p1/M, z13.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x28\n"
       "mov x21, x9\n"
@@ -152,107 +151,107 @@
       "madd x23, x28, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      "ld1b { z10.b }, p1/Z, [x25]\n"
-      ".inst 0xa04086fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
-      "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
-      ".inst 0xa04186ed  // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
-      ".inst 0xa04286f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1b { z20.b }, p1/Z, [x25]\n"
+      ".inst 0xa04086e5  // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+      "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+      ".inst 0xa04186f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+      ".inst 0xa04286fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
       "addvl x25, x25, #4\n"
-      ".inst 0xa04386e1  // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa04386f1  // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa09c2540  // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+      ".inst 0xa0842680  // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa09d2541  // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
-      ".inst 0xa09e2542  // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
-      ".inst 0xa09f2543  // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
-      "ld1b { z10.b }, p1/Z, [x25]\n"
-      ".inst 0xa08c2600  // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
-      ".inst 0xa04086fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa08d2601  // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
-      ".inst 0xa08e2602  // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
-      ".inst 0xa08f2603  // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
-      "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
-      ".inst 0xa09826a0  // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
-      ".inst 0xa04186ed  // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa09926a1  // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
-      ".inst 0xa09a26a2  // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
-      ".inst 0xa09b26a3  // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
-      "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
-      ".inst 0xa04286f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      ".inst 0xa0802660  // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
-      ".inst 0xa0812661  // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
-      ".inst 0xa0822662  // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
-      ".inst 0xa0832663  // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
-      "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+      ".inst 0xa0852681  // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+      ".inst 0xa0862682  // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+      ".inst 0xa0872683  // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+      "ld1b { z20.b }, p1/Z, [x25]\n"
+      ".inst 0xa0982560  // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+      ".inst 0xa04086e5  // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa0992561  // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa09a2562  // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+      ".inst 0xa09b2563  // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+      "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+      ".inst 0xa09c2440  // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+      ".inst 0xa04186f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa09d2441  // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+      ".inst 0xa09e2442  // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+      ".inst 0xa09f2443  // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+      "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+      ".inst 0xa04286fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      ".inst 0xa09025c0  // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+      ".inst 0xa09125c1  // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+      ".inst 0xa09225c2  // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+      ".inst 0xa09325c3  // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+      "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
       "addvl x25, x25, #4\n"
-      ".inst 0xa04386e1  // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa04386f1  // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa09c2540  // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
-      ".inst 0xa09d2541  // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
-      ".inst 0xa09e2542  // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
-      ".inst 0xa09f2543  // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
-      ".inst 0xa08c2600  // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
-      ".inst 0xa08d2601  // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
-      ".inst 0xa08e2602  // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
-      ".inst 0xa08f2603  // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
-      ".inst 0xa09826a0  // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
-      ".inst 0xa09926a1  // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
-      ".inst 0xa09a26a2  // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
-      ".inst 0xa09b26a3  // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
-      ".inst 0xa0802660  // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
-      ".inst 0xa0812661  // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
-      ".inst 0xa0822662  // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
-      ".inst 0xa0832663  // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+      ".inst 0xa0842680  // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+      ".inst 0xa0852681  // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+      ".inst 0xa0862682  // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+      ".inst 0xa0872683  // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+      ".inst 0xa0982560  // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+      ".inst 0xa0992561  // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa09a2562  // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+      ".inst 0xa09b2563  // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+      ".inst 0xa09c2440  // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+      ".inst 0xa09d2441  // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+      ".inst 0xa09e2442  // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+      ".inst 0xa09f2443  // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+      ".inst 0xa09025c0  // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+      ".inst 0xa09125c1  // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+      ".inst 0xa09225c2  // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+      ".inst 0xa09325c3  // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      "ld1b { z10.b }, p1/Z, [x25]\n"
+      "ld1b { z16.b }, p1/Z, [x25]\n"
       "subs x21, x21, #0x1\n"
       "addvl x25, x25, #1\n"
-      ".inst 0xa04086fc  // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa04086e4  // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #4\n"
-      ".inst 0xa09c2540  // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
-      ".inst 0xa09d2541  // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
-      ".inst 0xa09e2542  // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
-      ".inst 0xa09f2543  // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+      ".inst 0xa0842600  // smopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
+      ".inst 0xa0852601  // smopa za1.s, p1/M, p1/M, z16.b, z5.b\n"
+      ".inst 0xa0862602  // smopa za2.s, p1/M, p1/M, z16.b, z6.b\n"
+      ".inst 0xa0872603  // smopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
-      "ld1w { z14.s }, p1/Z, [x25]\n"
+      "ld1w { z15.s }, p1/Z, [x25]\n"
       "addvl x25, x25, #1\n"
-      ".inst 0xc09125c0  // addva za0.s, p1/M, p1/M, z14.s\n"
-      ".inst 0xc09125c1  // addva za1.s, p1/M, p1/M, z14.s\n"
-      ".inst 0xc09125c2  // addva za2.s, p1/M, p1/M, z14.s\n"
-      ".inst 0xc09125c3  // addva za3.s, p1/M, p1/M, z14.s\n"
+      ".inst 0xc09125e0  // addva za0.s, p1/M, p1/M, z15.s\n"
+      ".inst 0xc09125e1  // addva za1.s, p1/M, p1/M, z15.s\n"
+      ".inst 0xc09125e2  // addva za2.s, p1/M, p1/M, z15.s\n"
+      ".inst 0xc09125e3  // addva za3.s, p1/M, p1/M, z15.s\n"
       "tbz x14, #1, 14f\n"
       "tbz x14, #0, 12f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c5b8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x13]\n"
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      ".inst 0xc0840700  // mova za0h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
-      ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa043c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c5a0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+      ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
+      ".inst 0xc0840400  // mova za0h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5a0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c57c  // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
+      ".inst 0xa060c578  // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
       "addvl x13, x13, #16\n"
-      ".inst 0xa061c568  // st1w { z8.s-z11.s }, pn9.b, [x11, #0x4, MUL VL]\n"
-      ".inst 0xa062c578  // st1w { z24.s-z27.s }, pn9.b, [x11, #0x8, MUL VL]\n"
-      ".inst 0xa063c56c  // st1w { z12.s-z15.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+      ".inst 0xa061c564  // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+      ".inst 0xa062c574  // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+      ".inst 0xa063c560  // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
       "addvl x11, x11, #16\n"
       "blt 11b\n"
       "b 21f\n"
@@ -260,16 +259,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
-      ".inst 0xa060c57c  // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa061c560  // st1w { z0.s-z3.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xa060c564  // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
+      ".inst 0xa061c574  // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c568  // st1w { z8.s-z11.s }, pn9.b, [x11, #0x8, MUL VL]\n"
-      ".inst 0xa063c570  // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+      ".inst 0xa062c56c  // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+      ".inst 0xa063c568  // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
       "addvl x11, x11, #16\n"
       "blt 13b\n"
       "b 21f\n"
@@ -277,17 +276,17 @@
       "ldr x24, [%x[args], %[offsetof_C]]\n"
       "add x24, x24, x28\n"  // C += n
       "sub x23, x10, x9\n"
-      "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
       "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
       "madd x24, x9, x22, x24\n"  // C += m * ldc
-      "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+      "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
       "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
       "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
       "tbz x14, #2, 15f\n"
@@ -295,10 +294,10 @@
       "add x21, x21, x28\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa040c28c  // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
+      ".inst 0xa040c284  // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa040c284  // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
+      ".inst 0xa040c28c  // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
       "15:"  // Store to output array: Load per-channel parameters: End
       "cntw x20\n"
       "whilelt p0.b, x28, x27\n"
@@ -311,22 +310,22 @@
       "16:"  // Store to output array: Accumulator row 0 loop
       ".inst 0xc086001a  // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
       ".inst 0xc086005c  // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
-      ".inst 0xc1aca41a  // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+      ".inst 0xc1a4a41a  // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
       ".inst 0xc0860096  // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
       ".inst 0xc08600d0  // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
-      ".inst 0xc1ada41c  // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
-      ".inst 0xc1aea416  // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+      ".inst 0xc1a5a41c  // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+      ".inst 0xc1a6a416  // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x21, LSL #1\n"
-      ".inst 0xc1afa410  // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
-      ".inst 0xc1a4a23a  // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
-      ".inst 0xc1a5a23c  // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
-      ".inst 0xc1a6a236  // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
-      ".inst 0xc1a7a230  // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
-      ".inst 0xc1a1a31a  // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
-      ".inst 0xc1a1a31c  // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
-      ".inst 0xc1a1a316  // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
-      ".inst 0xc1a1a310  // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+      ".inst 0xc1a7a410  // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+      ".inst 0xc1aca23a  // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+      ".inst 0xc1ada23c  // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+      ".inst 0xc1aea236  // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+      ".inst 0xc1afa230  // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+      ".inst 0xc1a0a31a  // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+      ".inst 0xc1a0a31c  // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+      ".inst 0xc1a0a316  // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+      ".inst 0xc1a0a310  // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
       ".inst 0xc1b4c6ba  // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
       ".inst 0xc1b4c6bc  // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
       "uzp1 z19.b, z26.b, z28.b\n"
@@ -344,29 +343,29 @@
       "blt 16b\n"
       "17:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 18f\n"
-      ".inst 0xc0860002  // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+      ".inst 0xc086000a  // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
       ".inst 0xc0860058  // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
-      ".inst 0xc1aca402  // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
-      ".inst 0xc0860090  // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
-      ".inst 0xc08600ca  // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
-      ".inst 0xc1ada418  // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
-      ".inst 0xc1aea410  // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
-      ".inst 0xc1afa40a  // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
-      ".inst 0xc1a4a222  // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
-      ".inst 0xc1a5a238  // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
-      ".inst 0xc1a6a230  // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
-      ".inst 0xc1a7a22a  // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
-      ".inst 0xc1a1a302  // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
-      ".inst 0xc1a1a318  // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
-      ".inst 0xc1a1a310  // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
-      ".inst 0xc1a1a30a  // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
-      ".inst 0xc1b4c6a2  // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4c6b8  // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
-      "uzp1 z23.b, z2.b, z24.b\n"
-      ".inst 0xc1b4c6b0  // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+      ".inst 0xc1a4a40a  // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
+      ".inst 0xc086009a  // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+      ".inst 0xc08600de  // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+      ".inst 0xc1a5a418  // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+      ".inst 0xc1a6a41a  // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+      ".inst 0xc1a7a41e  // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+      ".inst 0xc1aca22a  // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+      ".inst 0xc1ada238  // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+      ".inst 0xc1aea23a  // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+      ".inst 0xc1afa23e  // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+      ".inst 0xc1a0a30a  // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+      ".inst 0xc1a0a318  // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+      ".inst 0xc1a0a31a  // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+      ".inst 0xc1a0a31e  // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
       ".inst 0xc1b4c6aa  // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
-      "uzp1 z16.b, z16.b, z10.b\n"
-      "uzp1 z16.b, z23.b, z16.b\n"
+      ".inst 0xc1b4c6b8  // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+      "uzp1 z17.b, z10.b, z24.b\n"
+      ".inst 0xc1b4c6ba  // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc1b4c6be  // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+      "uzp1 z16.b, z26.b, z30.b\n"
+      "uzp1 z16.b, z17.b, z16.b\n"
       "st1b { z16.b }, p0, [x24]\n"
       "18:"  // Store to output array: Accumulator row 0 oddments: End
       "19:"  // Store to output array: End
@@ -374,14 +373,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "20:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa041c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
-      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5bc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa041c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa042c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa043c5a0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+      ".inst 0xc0840403  // mova za3h.s[x12], { z0.s-z3.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x13, x13, #16\n"
@@ -405,4 +404,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
index 954b0da..79990f7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
 
-  cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
index 0b64281..7033de5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -100,14 +99,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840400  // mova za0h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa041c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa040c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa041c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      ".inst 0xa00a4295  // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
-      ".inst 0xc0902680  // addha za0.s, p1/M, p1/M, z20.s\n"
-      ".inst 0xc09026a1  // addha za1.s, p1/M, p1/M, z21.s\n"
-      ".inst 0xc0902682  // addha za2.s, p1/M, p1/M, z20.s\n"
-      ".inst 0xc09026a3  // addha za3.s, p1/M, p1/M, z21.s\n"
+      ".inst 0xa00a4299  // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+      ".inst 0xc0902700  // addha za0.s, p1/M, p1/M, z24.s\n"
+      ".inst 0xc0902721  // addha za1.s, p1/M, p1/M, z25.s\n"
+      ".inst 0xc0902702  // addha za2.s, p1/M, p1/M, z24.s\n"
+      ".inst 0xc0902723  // addha za3.s, p1/M, p1/M, z25.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -152,75 +151,75 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa040077e  // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
-      ".inst 0xa04006f1  // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa041076e  // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa04106e9  // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa0420760  // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa14206fc  // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa0430764  // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0xa1400763  // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa14006f9  // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa1410774  // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa04106f7  // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa1420775  // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa14206f8  // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa1430765  // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14306ea  // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa14306ef  // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa09027c0  // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+      ".inst 0xa0912460  // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa09127c1  // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
-      ".inst 0xa09027e2  // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
-      ".inst 0xa09127e3  // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
-      ".inst 0xa040077e  // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
-      ".inst 0xa08825c0  // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
-      ".inst 0xa04006f1  // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa08925c1  // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
-      ".inst 0xa08825e2  // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
-      ".inst 0xa08925e3  // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
-      ".inst 0xa041076e  // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa0942400  // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
-      ".inst 0xa04106e9  // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa09c2401  // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
-      ".inst 0xa0942422  // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
-      ".inst 0xa09c2423  // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
-      ".inst 0xa0420760  // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa14206fc  // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa0822480  // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
-      ".inst 0xa08a2481  // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
-      ".inst 0xa08224a2  // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
-      ".inst 0xa08a24a3  // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
-      ".inst 0xa0430764  // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0xa0992461  // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+      ".inst 0xa0912562  // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+      ".inst 0xa0992563  // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa1400763  // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa0962680  // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+      ".inst 0xa14006f9  // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa0972681  // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+      ".inst 0xa0962782  // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+      ".inst 0xa0972783  // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+      ".inst 0xa1410774  // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa09026a0  // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+      ".inst 0xa04106f7  // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa09826a1  // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+      ".inst 0xa09027a2  // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+      ".inst 0xa09827a3  // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+      ".inst 0xa1420775  // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa14206f8  // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa08724a0  // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+      ".inst 0xa08f24a1  // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+      ".inst 0xa08725a2  // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+      ".inst 0xa08f25a3  // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+      ".inst 0xa1430765  // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14306ea  // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa14306ef  // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa09027c0  // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
-      ".inst 0xa09127c1  // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
-      ".inst 0xa09027e2  // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
-      ".inst 0xa09127e3  // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
-      ".inst 0xa08825c0  // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
-      ".inst 0xa08925c1  // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
-      ".inst 0xa08825e2  // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
-      ".inst 0xa08925e3  // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
-      ".inst 0xa0942400  // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
-      ".inst 0xa09c2401  // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
-      ".inst 0xa0942422  // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
-      ".inst 0xa09c2423  // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
-      ".inst 0xa0822480  // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
-      ".inst 0xa08a2481  // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
-      ".inst 0xa08224a2  // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
-      ".inst 0xa08a24a3  // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+      ".inst 0xa0912460  // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+      ".inst 0xa0992461  // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+      ".inst 0xa0912562  // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+      ".inst 0xa0992563  // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa0962680  // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+      ".inst 0xa0972681  // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+      ".inst 0xa0962782  // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+      ".inst 0xa0972783  // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+      ".inst 0xa09026a0  // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+      ".inst 0xa09826a1  // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+      ".inst 0xa09027a2  // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+      ".inst 0xa09827a3  // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+      ".inst 0xa08724a0  // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+      ".inst 0xa08f24a1  // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+      ".inst 0xa08725a2  // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+      ".inst 0xa08f25a3  // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa040077e  // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa1400773  // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #2\n"
       ".inst 0xa04006f0  // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #2\n"
-      ".inst 0xa09027c0  // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
-      ".inst 0xa09127c1  // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
-      ".inst 0xa09027e2  // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
-      ".inst 0xa09127e3  // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+      ".inst 0xa0902660  // smopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+      ".inst 0xa0912661  // smopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+      ".inst 0xa0902762  // smopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+      ".inst 0xa0912763  // smopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       ".inst 0xa040476e  // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
@@ -234,25 +233,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
-      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840702  // mova za2h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa043c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
+      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
+      ".inst 0xa060c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c5d4  // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 24f\n"
@@ -260,16 +259,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
+      ".inst 0xa061c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5d4  // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 24f\n"
@@ -277,13 +276,13 @@
       "ldr x26, [%x[args], %[offsetof_C]]\n"
       "add x26, x26, x10\n"  // C += n
       "sub x25, x13, x11\n"
-      "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
       "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
       "madd x26, x11, x24, x26\n"  // C += m * ldc
-      "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
       "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
       "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
       "tbz x16, #2, 15f\n"
@@ -291,10 +290,10 @@
       "add x21, x21, x10\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa0404282  // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
+      ".inst 0xa0404280  // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa0404280  // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
+      ".inst 0xa0404282  // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
       "15:"  // Store to output array: Load per-channel parameters: End
       "cntw x23\n"
       "whilelt p0.h, x10, x9\n"
@@ -305,26 +304,26 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 17f\n"
       "16:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
-      ".inst 0xc1a2ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
-      ".inst 0xc1a3ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1a1ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a0aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
-      ".inst 0xc1a1aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
-      ".inst 0xc1abab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
-      ".inst 0xc1abab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
-      ".inst 0xc1b8cf2c  // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
-      ".inst 0xc1b8cf3c  // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z12.h, z28.h\n"
+      ".inst 0xc1a2aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc1a3aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1aeab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+      ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+      ".inst 0xc1b8cf28  // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+      "uzp1 z16.h, z4.h, z8.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z13.h, z29.h\n"
-      "uzp1 z17.h, z14.h, z30.h\n"
+      "uzp1 z16.h, z5.h, z9.h\n"
+      "uzp1 z17.h, z6.h, z10.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z15.h, z31.h\n"
+      "uzp1 z16.h, z7.h, z11.h\n"
       "st1b { z17.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "st1b { z16.h }, p0, [x26]\n"
@@ -332,27 +331,27 @@
       "blt 16b\n"
       "17:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 18f\n"
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xc1a2ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
-      ".inst 0xc1a3ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+      ".inst 0xc1a1ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a0aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
-      ".inst 0xc1a1aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
-      ".inst 0xc1abab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
-      ".inst 0xc1abab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
-      ".inst 0xc1b8cf3c  // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
-      ".inst 0xc1b8cf2c  // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z28.h, z12.h\n"
+      ".inst 0xc1a2aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+      ".inst 0xc1a3aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+      ".inst 0xc1aeab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1b8cf28  // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+      ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+      "uzp1 z16.h, z8.h, z4.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 18f\n"
       "subs x20, x20, #0x1\n"
-      "uzp1 z16.h, z29.h, z13.h\n"
+      "uzp1 z16.h, z9.h, z5.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 18f\n"
-      "uzp1 z16.h, z30.h, z14.h\n"
+      "uzp1 z16.h, z10.h, z6.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "18:"  // Store to output array: Accumulator row 0 oddments: End
@@ -367,25 +366,25 @@
       "cbz x21, 20f\n"
       "19:"  // Store to output array: Accumulator row 1 loop
       ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
-      ".inst 0xc1a3ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
+      ".inst 0xc1a0ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1a1ac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a0aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
-      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
-      ".inst 0xc1abab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
-      ".inst 0xc1abab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+      ".inst 0xc1a2aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc1a3aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1aeab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
       ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
-      ".inst 0xc1b8cf30  // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z4.h, z16.h\n"
+      ".inst 0xc1b8cf34  // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+      "uzp1 z16.h, z4.h, z20.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z5.h, z17.h\n"
-      "uzp1 z17.h, z6.h, z18.h\n"
+      "uzp1 z16.h, z5.h, z21.h\n"
+      "uzp1 z17.h, z6.h, z22.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z7.h, z19.h\n"
+      "uzp1 z16.h, z7.h, z23.h\n"
       "st1b { z17.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "st1b { z16.h }, p0, [x26]\n"
@@ -393,27 +392,27 @@
       "blt 19b\n"
       "20:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 21f\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
       ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xc1a2ac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
-      ".inst 0xc1a3ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+      ".inst 0xc1a0ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1a1ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a0aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
-      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
-      ".inst 0xc1abab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
-      ".inst 0xc1abab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
-      ".inst 0xc1b8cf34  // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+      ".inst 0xc1a2aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc1a3aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1aeab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+      ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
       ".inst 0xc1b8cf30  // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z20.h, z16.h\n"
+      "uzp1 z16.h, z4.h, z16.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
       "subs x20, x20, #0x1\n"
-      "uzp1 z16.h, z21.h, z17.h\n"
+      "uzp1 z16.h, z5.h, z17.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
-      "uzp1 z16.h, z22.h, z18.h\n"
+      "uzp1 z16.h, z6.h, z18.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "21:"  // Store to output array: Accumulator row 1 oddments: End
       "22:"  // Store to output array: End
@@ -452,4 +451,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
index 420c219..ef39cbb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
 
-  cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
index 0d0e3da..4601f05 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -100,14 +99,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa041c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xa043c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840703  // mova za3h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa040c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa041c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa042c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840782  // mova za2h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n"
-      ".inst 0xc09025e0  // addha za0.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e1  // addha za1.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e2  // addha za2.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e3  // addha za3.s, p1/M, p1/M, z15.s\n"
+      "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+      ".inst 0xc0902500  // addha za0.s, p1/M, p1/M, z8.s\n"
+      ".inst 0xc0902501  // addha za1.s, p1/M, p1/M, z8.s\n"
+      ".inst 0xc0902502  // addha za2.s, p1/M, p1/M, z8.s\n"
+      ".inst 0xc0902503  // addha za3.s, p1/M, p1/M, z8.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -152,107 +151,107 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
-      "ldnt1b { z0.b }, p1/Z, [x23]\n"
-      ".inst 0xa1418373  // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa1428370  // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa1438362  // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa0408364  // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+      "ldnt1b { z14.b }, p1/Z, [x23]\n"
+      ".inst 0xa0418374  // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa0428378  // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa0438368  // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa0802640  // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+      ".inst 0xa08e2480  // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa08026c1  // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
-      ".inst 0xa0802742  // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
-      ".inst 0xa08027c3  // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
-      ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
-      ".inst 0xa0892660  // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
-      "ldnt1b { z0.b }, p1/Z, [x23]\n"
-      ".inst 0xa08926e1  // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
-      ".inst 0xa0892762  // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
-      ".inst 0xa08927e3  // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
-      ".inst 0xa1418373  // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa0952600  // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
-      "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa0952681  // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
-      ".inst 0xa0952702  // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
-      ".inst 0xa0952783  // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
-      ".inst 0xa1428370  // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa08c2440  // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
-      ".inst 0xa08c24c1  // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
-      ".inst 0xa08c2542  // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
-      ".inst 0xa08c25c3  // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
-      ".inst 0xa1438362  // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa08e24a1  // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+      ".inst 0xa08e24c2  // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+      ".inst 0xa08e24e3  // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+      ".inst 0xa0408364  // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+      ".inst 0xa09f2680  // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+      "ldnt1b { z14.b }, p1/Z, [x23]\n"
+      ".inst 0xa09f26a1  // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+      ".inst 0xa09f26c2  // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+      ".inst 0xa09f26e3  // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+      ".inst 0xa0418374  // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa08d2700  // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+      "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa08d2721  // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+      ".inst 0xa08d2742  // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+      ".inst 0xa08d2763  // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+      ".inst 0xa0428378  // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa09d2500  // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+      ".inst 0xa09d2521  // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+      ".inst 0xa09d2542  // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+      ".inst 0xa09d2563  // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+      ".inst 0xa0438368  // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa0802640  // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
-      ".inst 0xa08026c1  // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
-      ".inst 0xa0802742  // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
-      ".inst 0xa08027c3  // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
-      ".inst 0xa0892660  // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
-      ".inst 0xa08926e1  // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
-      ".inst 0xa0892762  // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
-      ".inst 0xa08927e3  // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
-      ".inst 0xa0952600  // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
-      ".inst 0xa0952681  // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
-      ".inst 0xa0952702  // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
-      ".inst 0xa0952783  // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
-      ".inst 0xa08c2440  // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
-      ".inst 0xa08c24c1  // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
-      ".inst 0xa08c2542  // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
-      ".inst 0xa08c25c3  // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+      ".inst 0xa08e2480  // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+      ".inst 0xa08e24a1  // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+      ".inst 0xa08e24c2  // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+      ".inst 0xa08e24e3  // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+      ".inst 0xa09f2680  // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+      ".inst 0xa09f26a1  // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+      ".inst 0xa09f26c2  // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+      ".inst 0xa09f26e3  // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+      ".inst 0xa08d2700  // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+      ".inst 0xa08d2721  // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+      ".inst 0xa08d2742  // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+      ".inst 0xa08d2763  // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+      ".inst 0xa09d2500  // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+      ".inst 0xa09d2521  // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+      ".inst 0xa09d2542  // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+      ".inst 0xa09d2563  // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
       ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #4\n"
-      "ld1b { z0.b }, p1/Z, [x23]\n"
+      "ld1b { z15.b }, p1/Z, [x23]\n"
       "addvl x23, x23, #1\n"
-      ".inst 0xa0802640  // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
-      ".inst 0xa08026c1  // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
-      ".inst 0xa0802742  // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
-      ".inst 0xa08027c3  // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+      ".inst 0xa08f2640  // smopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
+      ".inst 0xa08f26c1  // smopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
+      ".inst 0xa08f2742  // smopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
+      ".inst 0xa08f27c3  // smopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
-      ".inst 0xa040c360  // ld1w { z0.s-z3.s }, pn8.b/Z, [x27]\n"
+      ".inst 0xa140c363  // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
       "addvl x27, x27, #4\n"
-      ".inst 0xc0912400  // addva za0.s, p1/M, p1/M, z0.s\n"
-      ".inst 0xc0912421  // addva za1.s, p1/M, p1/M, z1.s\n"
-      ".inst 0xc0912442  // addva za2.s, p1/M, p1/M, z2.s\n"
-      ".inst 0xc0912463  // addva za3.s, p1/M, p1/M, z3.s\n"
+      ".inst 0xc0912460  // addva za0.s, p1/M, p1/M, z3.s\n"
+      ".inst 0xc09124e1  // addva za1.s, p1/M, p1/M, z7.s\n"
+      ".inst 0xc0912562  // addva za2.s, p1/M, p1/M, z11.s\n"
+      ".inst 0xc09125e3  // addva za3.s, p1/M, p1/M, z15.s\n"
       "tbz x16, #1, 14f\n"
       "tbz x16, #0, 12f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xa041c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840481  // mova za1h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
-      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa040c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xa041c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
       ".inst 0xa042c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
       ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa043c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
+      ".inst 0xa060c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c1d4  // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 30f\n"
@@ -260,16 +259,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
       ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa060c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
+      ".inst 0xa060c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
       ".inst 0xa061c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c1d4  // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1d8  // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c1c4  // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 30f\n"
@@ -277,22 +276,22 @@
       "ldr x26, [%x[args], %[offsetof_C]]\n"
       "add x26, x26, x10\n"  // C += n
       "sub x25, x13, x11\n"
-      "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
       "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
       "madd x26, x11, x24, x26\n"  // C += m * ldc
-      "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
-      "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
-      "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+      "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+      "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
       "tbz x16, #2, 15f\n"
       "ldr w21, [%x[args], %[offsetof_n_0]]\n"
       "add x21, x21, x10\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
       "add x20, x20, x21, LSL #2\n"
-      "ld1w { z8.s }, p0/Z, [x20]\n"
+      "ld1w { z2.s }, p0/Z, [x20]\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
       "add x20, x20, x21, LSL #2\n"
-      "ld1w { z7.s }, p0/Z, [x20]\n"
+      "ld1w { z1.s }, p0/Z, [x20]\n"
       "15:"  // Store to output array: Load per-channel parameters: End
       "cntw x23\n"
       "whilelt p0.s, x10, x9\n"
@@ -303,30 +302,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 17f\n"
       "16:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc1a8ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc1a2ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a4ccac  // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
-      "st1b { z12.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+      ".inst 0xc1b4ceb0  // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      "st1b { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z13.s }, p0, [x26]\n"
+      "st1b { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z14.s }, p0, [x26]\n"
+      "st1b { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z15.s }, p0, [x26]\n"
+      "st1b { z19.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 16b\n"
       "17:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 18f\n"
       ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc1a8ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+      ".inst 0xc1a2ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
-      ".inst 0xc1a6ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
-      ".inst 0xc1a4ccb0  // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+      ".inst 0xc1a0ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+      ".inst 0xc1b4ceb0  // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1b { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 18f\n"
@@ -347,38 +346,38 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 20f\n"
       "19:"  // Store to output array: Accumulator row 1 loop
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      ".inst 0xc1a8ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+      ".inst 0xc1a1aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
-      ".inst 0xc1a4ccb0  // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
-      "st1b { z16.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1b4cea4  // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+      "st1b { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z17.s }, p0, [x26]\n"
+      "st1b { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z18.s }, p0, [x26]\n"
+      "st1b { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z19.s }, p0, [x26]\n"
+      "st1b { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 19b\n"
       "20:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 21f\n"
-      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
-      ".inst 0xc1a8ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1a4ccbc  // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
-      "st1b { z28.s }, p0, [x26]\n"
+      ".inst 0xc1a1aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+      ".inst 0xc1a0ab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1b4cea4  // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+      "st1b { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
       "subs x20, x20, #0x1\n"
-      "st1b { z29.s }, p0, [x26]\n"
+      "st1b { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
-      "st1b { z30.s }, p0, [x26]\n"
+      "st1b { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "21:"  // Store to output array: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -391,30 +390,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 23f\n"
       "22:"  // Store to output array: Accumulator row 2 loop
-      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
-      ".inst 0xc1a8ac18  // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
+      ".inst 0xc1a2ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa38  // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+      ".inst 0xc1a1aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab18  // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
-      ".inst 0xc1a4ccb8  // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
-      "st1b { z24.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+      ".inst 0xc1b4cea8  // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+      "st1b { z8.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z25.s }, p0, [x26]\n"
+      "st1b { z9.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z26.s }, p0, [x26]\n"
+      "st1b { z10.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z27.s }, p0, [x26]\n"
+      "st1b { z11.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 22b\n"
       "23:"  // Store to output array: Accumulator row 2 oddments
       "cbz x20, 24f\n"
       ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
-      ".inst 0xc1a8ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+      ".inst 0xc1a2ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a4ccac  // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+      ".inst 0xc1a1aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+      ".inst 0xc1a0ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+      ".inst 0xc1b4ceac  // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
       "st1b { z12.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 24f\n"
@@ -435,52 +434,52 @@
       "and x20, x20, #0x3\n"
       "cbz x21, 26f\n"
       "25:"  // Store to output array: Accumulator row 3 loop
-      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
-      ".inst 0xc1a8ac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a2ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+      ".inst 0xc1a1aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
-      ".inst 0xc1a4ccb4  // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
-      "st1b { z20.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+      ".inst 0xc1b4cebc  // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1b { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z21.s }, p0, [x26]\n"
+      "st1b { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z22.s }, p0, [x26]\n"
+      "st1b { z30.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z23.s }, p0, [x26]\n"
+      "st1b { z31.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 25b\n"
       "26:"  // Store to output array: Accumulator row 3 oddments
       "cbz x20, 27f\n"
-      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xc1a8ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a2ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
-      ".inst 0xc1a6ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
-      ".inst 0xc1a4cca0  // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
-      "st1b { z0.s }, p0, [x26]\n"
+      ".inst 0xc1a1aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+      ".inst 0xc1a0ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+      ".inst 0xc1b4cebc  // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1b { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 27f\n"
       "subs x20, x20, #0x1\n"
-      "st1b { z1.s }, p0, [x26]\n"
+      "st1b { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 27f\n"
-      "st1b { z2.s }, p0, [x26]\n"
+      "st1b { z30.s }, p0, [x26]\n"
       "27:"  // Store to output array: Accumulator row 3 oddments: End
       "28:"  // Store to output array: End
       "tbz x16, #0, 30f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "29:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
       ".inst 0xa041c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa043c1e0  // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840403  // mova za3h.s[x12], { z0.s-z3.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -504,4 +503,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
index c969c7a..b9d8b60 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -32,7 +32,7 @@
 {
 
 // Implementations
-void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
 
 class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL
 {
@@ -40,7 +40,7 @@
   typedef int8_t operand_type;
   typedef int32_t result_type;
 
-  typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+  typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
 
   /* Kernel blocking parameters */
   static unsigned int out_height()
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {};
 
-  cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
index 12e714a..d11faa6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -32,10 +31,8 @@
 
 namespace arm_gemm {
 
-void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
 {
-  ARM_COMPUTE_UNUSED(act);
-
   struct KernelArgs
   {
     KernelArgs(
@@ -96,12 +93,12 @@
       "1:"  // Initial accumulator load from buffer: Loop
       ".inst 0xa040c564  // ld1w { z4.s-z7.s }, pn9.b/Z, [x11]\n"
       ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa041c57c  // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
-      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa042c570  // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
-      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c560  // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
-      ".inst 0xc0840403  // mova za3h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa041c56c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa042c57c  // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+      ".inst 0xc0840782  // mova za2h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa043c574  // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x11, x11, #16\n"
@@ -119,11 +116,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      ".inst 0xa11bc28a  // ldnt1w { z2.s, z6.s, z10.s, z14.s }, p8/Z, [x20, x27, LSL #2]\n"
-      ".inst 0xc0900040  // addha za0.s, p0/M, p0/M, z2.s\n"
-      ".inst 0xc09000c1  // addha za1.s, p0/M, p0/M, z6.s\n"
-      ".inst 0xc0900142  // addha za2.s, p0/M, p0/M, z10.s\n"
-      ".inst 0xc09001c3  // addha za3.s, p0/M, p0/M, z14.s\n"
+      ".inst 0xa11bc29b  // ldnt1w { z19.s, z23.s, z27.s, z31.s }, p8/Z, [x20, x27, LSL #2]\n"
+      ".inst 0xc0900260  // addha za0.s, p0/M, p0/M, z19.s\n"
+      ".inst 0xc09002e1  // addha za1.s, p0/M, p0/M, z23.s\n"
+      ".inst 0xc0900362  // addha za2.s, p0/M, p0/M, z27.s\n"
+      ".inst 0xc09003e3  // addha za3.s, p0/M, p0/M, z31.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x27\n"
       "mov x21, x28\n"
@@ -146,75 +143,75 @@
       "madd x23, x27, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      "ld1b { z20.b }, p0/Z, [x24]\n"
-      ".inst 0xa14086e9  // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
-      "ld1b { z10.b }, p0/Z, [x24, #1, MUL VL]\n"
-      ".inst 0xa14186fa  // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      "ld1b { z16.b }, p0/Z, [x24, #2, MUL VL]\n"
-      ".inst 0xa14286eb  // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      "ld1b { z25.b }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1b { z30.b }, p0/Z, [x24]\n"
+      ".inst 0xa04086e1  // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+      "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+      ".inst 0xa04186f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+      ".inst 0xa04286e5  // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
       "addvl x24, x24, #4\n"
-      ".inst 0xa14386e8  // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa04386f1  // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa0810280  // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+      ".inst 0xa08003c0  // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa0850281  // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
-      ".inst 0xa0890282  // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
-      ".inst 0xa08d0283  // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
-      "ld1b { z20.b }, p0/Z, [x24]\n"
-      ".inst 0xa0920140  // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
-      ".inst 0xa14086e9  // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa0960141  // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
-      ".inst 0xa09a0142  // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
-      ".inst 0xa09e0143  // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
-      "ld1b { z10.b }, p0/Z, [x24, #1, MUL VL]\n"
-      ".inst 0xa0830200  // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
-      ".inst 0xa14186fa  // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa0870201  // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
-      ".inst 0xa08b0202  // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
-      ".inst 0xa08f0203  // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
-      "ld1b { z16.b }, p0/Z, [x24, #2, MUL VL]\n"
-      ".inst 0xa14286eb  // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      ".inst 0xa0800320  // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
-      ".inst 0xa0840321  // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
-      ".inst 0xa0880322  // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
-      ".inst 0xa08c0323  // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
-      "ld1b { z25.b }, p0/Z, [x24, #3, MUL VL]\n"
+      ".inst 0xa08103c1  // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+      ".inst 0xa08203c2  // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+      ".inst 0xa08303c3  // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+      "ld1b { z30.b }, p0/Z, [x24]\n"
+      ".inst 0xa09802a0  // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
+      ".inst 0xa04086e1  // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa09902a1  // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
+      ".inst 0xa09a02a2  // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
+      ".inst 0xa09b02a3  // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
+      "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n"
+      ".inst 0xa0840380  // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+      ".inst 0xa04186f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa0850381  // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+      ".inst 0xa0860382  // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+      ".inst 0xa0870383  // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+      "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n"
+      ".inst 0xa04286e5  // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      ".inst 0xa0900160  // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+      ".inst 0xa0910161  // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+      ".inst 0xa0920162  // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+      ".inst 0xa0930163  // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
+      "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n"
       "addvl x24, x24, #4\n"
-      ".inst 0xa14386e8  // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa04386f1  // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa0810280  // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
-      ".inst 0xa0850281  // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
-      ".inst 0xa0890282  // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
-      ".inst 0xa08d0283  // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
-      ".inst 0xa0920140  // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
-      ".inst 0xa0960141  // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
-      ".inst 0xa09a0142  // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
-      ".inst 0xa09e0143  // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
-      ".inst 0xa0830200  // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
-      ".inst 0xa0870201  // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
-      ".inst 0xa08b0202  // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
-      ".inst 0xa08f0203  // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
-      ".inst 0xa0800320  // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
-      ".inst 0xa0840321  // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
-      ".inst 0xa0880322  // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
-      ".inst 0xa08c0323  // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
+      ".inst 0xa08003c0  // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n"
+      ".inst 0xa08103c1  // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n"
+      ".inst 0xa08203c2  // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n"
+      ".inst 0xa08303c3  // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n"
+      ".inst 0xa09802a0  // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n"
+      ".inst 0xa09902a1  // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n"
+      ".inst 0xa09a02a2  // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n"
+      ".inst 0xa09b02a3  // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n"
+      ".inst 0xa0840380  // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n"
+      ".inst 0xa0850381  // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n"
+      ".inst 0xa0860382  // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n"
+      ".inst 0xa0870383  // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n"
+      ".inst 0xa0900160  // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n"
+      ".inst 0xa0910161  // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n"
+      ".inst 0xa0920162  // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n"
+      ".inst 0xa0930163  // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      "ld1b { z20.b }, p0/Z, [x24]\n"
+      "ld1b { z22.b }, p0/Z, [x24]\n"
       "subs x21, x21, #0x1\n"
       "addvl x24, x24, #1\n"
-      ".inst 0xa14086e1  // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa14086f1  // ld1b { z17.b, z21.b, z25.b, z29.b }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #4\n"
-      ".inst 0xa0810280  // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
-      ".inst 0xa0850281  // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
-      ".inst 0xa0890282  // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
-      ".inst 0xa08d0283  // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+      ".inst 0xa09102c0  // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
+      ".inst 0xa09502c1  // smopa za1.s, p0/M, p0/M, z22.b, z21.b\n"
+      ".inst 0xa09902c2  // smopa za2.s, p0/M, p0/M, z22.b, z25.b\n"
+      ".inst 0xa09d02c3  // smopa za3.s, p0/M, p0/M, z22.b, z29.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x13, #1, 14f\n"
@@ -222,25 +219,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c57c  // ld1w { z28.s-z31.s }, pn9.b/Z, [x11]\n"
+      ".inst 0xa040c570  // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
       ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
-      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
-      ".inst 0xa041c560  // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
-      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
-      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
-      ".inst 0xa042c570  // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
-      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c56c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
+      ".inst 0xa041c570  // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      ".inst 0xa042c560  // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa043c574  // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       ".inst 0xa060c544  // st1w { z4.s-z7.s }, pn9.b, [x10]\n"
       "addvl x11, x11, #16\n"
-      ".inst 0xa061c554  // st1w { z20.s-z23.s }, pn9.b, [x10, #0x4, MUL VL]\n"
-      ".inst 0xa062c558  // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
-      ".inst 0xa063c55c  // st1w { z28.s-z31.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+      ".inst 0xa061c54c  // st1w { z12.s-z15.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+      ".inst 0xa062c55c  // st1w { z28.s-z31.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+      ".inst 0xa063c550  // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
       "addvl x10, x10, #16\n"
       "blt 11b\n"
       "b 20f\n"
@@ -248,16 +245,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860414  // mova { z20.s-z23.s }, za0h.s[x12]\n"
-      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
-      ".inst 0xa060c554  // st1w { z20.s-z23.s }, pn9.b, [x10]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa061c540  // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa060c54c  // st1w { z12.s-z15.s }, pn9.b, [x10]\n"
+      ".inst 0xc0860440  // mova { z0.s-z3.s }, za2h.s[x12]\n"
+      ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
+      ".inst 0xa061c544  // st1w { z4.s-z7.s }, pn9.b, [x10, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c548  // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
-      ".inst 0xa063c54c  // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+      ".inst 0xa062c540  // st1w { z0.s-z3.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+      ".inst 0xa063c558  // st1w { z24.s-z27.s }, pn9.b, [x10, #0xc, MUL VL]\n"
       "addvl x10, x10, #16\n"
       "blt 13b\n"
       "b 20f\n"
@@ -293,32 +290,32 @@
       "16:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa160c2e0  // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa160c2f0  // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x23]\n"
       "add x23, x23, x22\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa160c2e1  // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n"
+      ".inst 0xa160c2f1  // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x23]\n"
       "add x23, x23, x22\n"
       "beq 17f\n"
-      ".inst 0xa160c2e2  // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n"
+      ".inst 0xa160c2f2  // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x23]\n"
       "17:"  // Store to output array: Accumulator row 0 oddments: End
       "18:"  // Store to output array: End
       "tbz x13, #0, 20f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "19:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c56c  // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c568  // ld1w { z8.s-z11.s }, pn9.b/Z, [x11]\n"
+      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
       ".inst 0xa041c570  // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c570  // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c564  // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa043c568  // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x11, x11, #16\n"
@@ -342,4 +339,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
index a0705e5..f05d2cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -32,7 +32,7 @@
 {
 
 // Implementations
-void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
 
 class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL
 {
@@ -40,7 +40,7 @@
   typedef int8_t operand_type;
   typedef int32_t result_type;
 
-  typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+  typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
 
   /* Kernel blocking parameters */
   static unsigned int out_height()
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {};
 
-  cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
index d7a7528..47de894 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -32,10 +31,8 @@
 
 namespace arm_gemm {
 
-void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
 {
-  ARM_COMPUTE_UNUSED(act);
-
   struct KernelArgs
   {
     KernelArgs(
@@ -96,12 +93,12 @@
       "1:"  // Initial accumulator load from buffer: Loop
       ".inst 0xa040c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
       ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
-      ".inst 0xa041c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa042c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840782  // mova za2h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa043c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa041c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa042c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa043c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840403  // mova za3h.s[x12], { z0.s-z3.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -119,11 +116,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      ".inst 0xa10a429c  // ldnt1w { z20.s, z28.s }, p8/Z, [x20, x10, LSL #2]\n"
+      ".inst 0xa00a4295  // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
       ".inst 0xc0900280  // addha za0.s, p0/M, p0/M, z20.s\n"
-      ".inst 0xc0900381  // addha za1.s, p0/M, p0/M, z28.s\n"
+      ".inst 0xc09002a1  // addha za1.s, p0/M, p0/M, z21.s\n"
       ".inst 0xc0900282  // addha za2.s, p0/M, p0/M, z20.s\n"
-      ".inst 0xc0900383  // addha za3.s, p0/M, p0/M, z28.s\n"
+      ".inst 0xc09002a3  // addha za3.s, p0/M, p0/M, z21.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -146,75 +143,75 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1400776  // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n"
-      ".inst 0xa14006f9  // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa1410770  // ld1b { z16.b, z24.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa14106eb  // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa0420768  // ld1b { z8.b-z9.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa04206f3  // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa040077c  // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa14006e8  // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa0410762  // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa14106ff  // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa042076e  // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa14206f8  // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
       ".inst 0xa0430764  // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14306fd  // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa04306f5  // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa09102c0  // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
+      ".inst 0xa0800380  // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa09902c1  // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
-      ".inst 0xa09103c2  // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
-      ".inst 0xa09903c3  // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
-      ".inst 0xa1400776  // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n"
-      ".inst 0xa0830200  // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
-      ".inst 0xa14006f9  // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa08b0201  // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
-      ".inst 0xa0830302  // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
-      ".inst 0xa08b0303  // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
-      ".inst 0xa1410770  // ld1b { z16.b, z24.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa0920100  // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
-      ".inst 0xa14106eb  // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa0930101  // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
-      ".inst 0xa0920122  // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
-      ".inst 0xa0930123  // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
-      ".inst 0xa0420768  // ld1b { z8.b-z9.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa04206f3  // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa0950080  // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
-      ".inst 0xa09d0081  // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
-      ".inst 0xa09500a2  // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
-      ".inst 0xa09d00a3  // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
+      ".inst 0xa0880381  // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+      ".inst 0xa08003a2  // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+      ".inst 0xa08803a3  // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+      ".inst 0xa040077c  // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa0970040  // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+      ".inst 0xa14006e8  // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa09f0041  // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+      ".inst 0xa0970062  // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+      ".inst 0xa09f0063  // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+      ".inst 0xa0410762  // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa09001c0  // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+      ".inst 0xa14106ff  // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa09801c1  // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+      ".inst 0xa09001e2  // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+      ".inst 0xa09801e3  // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+      ".inst 0xa042076e  // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa14206f8  // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa0940080  // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+      ".inst 0xa0950081  // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+      ".inst 0xa09400a2  // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+      ".inst 0xa09500a3  // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
       ".inst 0xa0430764  // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14306fd  // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa04306f5  // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa09102c0  // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
-      ".inst 0xa09902c1  // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
-      ".inst 0xa09103c2  // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
-      ".inst 0xa09903c3  // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
-      ".inst 0xa0830200  // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
-      ".inst 0xa08b0201  // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
-      ".inst 0xa0830302  // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
-      ".inst 0xa08b0303  // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
-      ".inst 0xa0920100  // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
-      ".inst 0xa0930101  // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
-      ".inst 0xa0920122  // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
-      ".inst 0xa0930123  // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
-      ".inst 0xa0950080  // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
-      ".inst 0xa09d0081  // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
-      ".inst 0xa09500a2  // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
-      ".inst 0xa09d00a3  // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
+      ".inst 0xa0800380  // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n"
+      ".inst 0xa0880381  // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n"
+      ".inst 0xa08003a2  // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n"
+      ".inst 0xa08803a3  // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n"
+      ".inst 0xa0970040  // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n"
+      ".inst 0xa09f0041  // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n"
+      ".inst 0xa0970062  // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n"
+      ".inst 0xa09f0063  // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n"
+      ".inst 0xa09001c0  // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n"
+      ".inst 0xa09801c1  // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n"
+      ".inst 0xa09001e2  // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n"
+      ".inst 0xa09801e3  // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n"
+      ".inst 0xa0940080  // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n"
+      ".inst 0xa0950081  // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n"
+      ".inst 0xa09400a2  // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n"
+      ".inst 0xa09500a3  // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa1400776  // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa1400774  // ld1b { z20.b, z28.b }, pn9.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #2\n"
-      ".inst 0xa14006f1  // ld1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa14006e7  // ld1b { z7.b, z15.b }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #2\n"
-      ".inst 0xa09102c0  // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
-      ".inst 0xa09902c1  // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
-      ".inst 0xa09103c2  // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
-      ".inst 0xa09903c3  // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
+      ".inst 0xa0870280  // smopa za0.s, p0/M, p0/M, z20.b, z7.b\n"
+      ".inst 0xa08f0281  // smopa za1.s, p0/M, p0/M, z20.b, z15.b\n"
+      ".inst 0xa0870382  // smopa za2.s, p0/M, p0/M, z28.b, z7.b\n"
+      ".inst 0xa08f0383  // smopa za3.s, p0/M, p0/M, z28.b, z15.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x16, #1, 14f\n"
@@ -223,24 +220,24 @@
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
       ".inst 0xa040c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
+      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
       ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
       ".inst 0xa041c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa043c5e4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840702  // mova za2h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa043c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5d8  // st1w { z24.s-z27.s }, pn9.b, [x14]\n"
+      ".inst 0xa060c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 23f\n"
@@ -248,16 +245,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860464  // mova { z4.s-z7.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
+      ".inst 0xa061c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 23f\n"
@@ -275,32 +272,32 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 16f\n"
       "15:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc0860438  // mova { z24.s-z27.s }, za1h.s[x12]\n"
-      ".inst 0xa1604350  // st1w { z16.s, z24.s }, p8, [x26]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
+      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
-      ".inst 0xa1604351  // st1w { z17.s, z25.s }, p8, [x26]\n"
+      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xa1604352  // st1w { z18.s, z26.s }, p8, [x26]\n"
+      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xa1604353  // st1w { z19.s, z27.s }, p8, [x26]\n"
+      ".inst 0xa1604347  // st1w { z7.s, z15.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "blt 15b\n"
       "16:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xa1604340  // st1w { z0.s, z8.s }, p8, [x26]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
+      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604341  // st1w { z1.s, z9.s }, p8, [x26]\n"
+      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "beq 17f\n"
-      ".inst 0xa1604342  // st1w { z2.s, z10.s }, p8, [x26]\n"
+      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "17:"  // Store to output array: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -328,30 +325,30 @@
       "19:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa1604344  // st1w { z4.s, z12.s }, p8, [x26]\n"
+      ".inst 0xc0860440  // mova { z0.s-z3.s }, za2h.s[x12]\n"
+      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
+      ".inst 0xa1604340  // st1w { z0.s, z8.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "beq 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xa1604345  // st1w { z5.s, z13.s }, p8, [x26]\n"
+      ".inst 0xa1604341  // st1w { z1.s, z9.s }, p8, [x26]\n"
       "add x26, x26, x23\n"
       "beq 20f\n"
-      ".inst 0xa1604346  // st1w { z6.s, z14.s }, p8, [x26]\n"
+      ".inst 0xa1604342  // st1w { z2.s, z10.s }, p8, [x26]\n"
       "20:"  // Store to output array: Accumulator row 1 oddments: End
       "21:"  // Store to output array: End
       "tbz x16, #0, 23f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "22:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa041c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa042c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa041c5e4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840481  // mova za1h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa042c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa043c5e4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -375,4 +372,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
index be1106d..ce10ab3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -32,7 +32,7 @@
 {
 
 // Implementations
-void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
 
 class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
 {
@@ -40,7 +40,7 @@
   typedef int8_t operand_type;
   typedef int32_t result_type;
 
-  typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+  typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer);
 
   /* Kernel blocking parameters */
   static unsigned int out_height()
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};
 
-  cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
index d863b6c..a23c44b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -32,10 +31,8 @@
 
 namespace arm_gemm {
 
-void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer)
 {
-  ARM_COMPUTE_UNUSED(act);
-
   struct KernelArgs
   {
     KernelArgs(
@@ -94,14 +91,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa041c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa042c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840700  // mova za0h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa041c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840703  // mova za3h.s[x12], { z24.s-z27.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -119,11 +116,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n"
-      ".inst 0xc09025e0  // addha za0.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e1  // addha za1.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e2  // addha za2.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e3  // addha za3.s, p1/M, p1/M, z15.s\n"
+      "ldnt1w { z17.s }, p0/Z, [x20, x10, LSL #2]\n"
+      ".inst 0xc0902620  // addha za0.s, p1/M, p1/M, z17.s\n"
+      ".inst 0xc0902621  // addha za1.s, p1/M, p1/M, z17.s\n"
+      ".inst 0xc0902622  // addha za2.s, p1/M, p1/M, z17.s\n"
+      ".inst 0xc0902623  // addha za3.s, p1/M, p1/M, z17.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -146,75 +143,75 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa0408370  // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
-      "ldnt1b { z7.b }, p1/Z, [x23]\n"
-      ".inst 0xa041837c  // ld1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      "ldnt1b { z13.b }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa0428360  // ld1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1b { z12.b }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa0438378  // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+      "ldnt1b { z12.b }, p1/Z, [x23]\n"
+      ".inst 0xa1418370  // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa1428363  // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa1438362  // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1b { z23.b }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa0872600  // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+      ".inst 0xa08c2640  // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa0872621  // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
-      ".inst 0xa0872642  // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
-      ".inst 0xa0872663  // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
-      ".inst 0xa0408370  // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
-      ".inst 0xa08d2780  // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
-      "ldnt1b { z7.b }, p1/Z, [x23]\n"
-      ".inst 0xa08d27a1  // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
-      ".inst 0xa08d27c2  // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
-      ".inst 0xa08d27e3  // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
-      ".inst 0xa041837c  // ld1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa08c2400  // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
-      "ldnt1b { z13.b }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa08c2421  // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
-      ".inst 0xa08c2442  // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
-      ".inst 0xa08c2463  // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
-      ".inst 0xa0428360  // ld1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1b { z12.b }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa0972700  // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
-      ".inst 0xa0972721  // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
-      ".inst 0xa0972742  // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
-      ".inst 0xa0972763  // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
-      ".inst 0xa0438378  // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa08c26c1  // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+      ".inst 0xa08c2742  // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+      ".inst 0xa08c27c3  // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+      ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
+      ".inst 0xa0852600  // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+      "ldnt1b { z12.b }, p1/Z, [x23]\n"
+      ".inst 0xa0852681  // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+      ".inst 0xa0852702  // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+      ".inst 0xa0852783  // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+      ".inst 0xa1418370  // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa0842460  // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+      "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa08424e1  // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+      ".inst 0xa0842562  // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+      ".inst 0xa08425e3  // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+      ".inst 0xa1428363  // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa0932440  // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+      ".inst 0xa09324c1  // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+      ".inst 0xa0932542  // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+      ".inst 0xa09325c3  // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+      ".inst 0xa1438362  // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1b { z23.b }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa0872600  // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
-      ".inst 0xa0872621  // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
-      ".inst 0xa0872642  // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
-      ".inst 0xa0872663  // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
-      ".inst 0xa08d2780  // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
-      ".inst 0xa08d27a1  // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
-      ".inst 0xa08d27c2  // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
-      ".inst 0xa08d27e3  // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
-      ".inst 0xa08c2400  // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
-      ".inst 0xa08c2421  // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
-      ".inst 0xa08c2442  // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
-      ".inst 0xa08c2463  // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
-      ".inst 0xa0972700  // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
-      ".inst 0xa0972721  // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
-      ".inst 0xa0972742  // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
-      ".inst 0xa0972763  // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
+      ".inst 0xa08c2640  // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n"
+      ".inst 0xa08c26c1  // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n"
+      ".inst 0xa08c2742  // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n"
+      ".inst 0xa08c27c3  // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n"
+      ".inst 0xa0852600  // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n"
+      ".inst 0xa0852681  // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+      ".inst 0xa0852702  // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n"
+      ".inst 0xa0852783  // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n"
+      ".inst 0xa0842460  // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n"
+      ".inst 0xa08424e1  // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n"
+      ".inst 0xa0842562  // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n"
+      ".inst 0xa08425e3  // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n"
+      ".inst 0xa0932440  // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n"
+      ".inst 0xa09324c1  // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n"
+      ".inst 0xa0932542  // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n"
+      ".inst 0xa09325c3  // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa0408370  // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n"
+      ".inst 0xa0408368  // ld1b { z8.b-z11.b }, pn8.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #4\n"
-      "ld1b { z7.b }, p1/Z, [x23]\n"
+      "ld1b { z15.b }, p1/Z, [x23]\n"
       "addvl x23, x23, #1\n"
-      ".inst 0xa0872600  // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
-      ".inst 0xa0872621  // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
-      ".inst 0xa0872642  // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
-      ".inst 0xa0872663  // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+      ".inst 0xa08f2500  // smopa za0.s, p1/M, p1/M, z8.b, z15.b\n"
+      ".inst 0xa08f2521  // smopa za1.s, p1/M, p1/M, z9.b, z15.b\n"
+      ".inst 0xa08f2542  // smopa za2.s, p1/M, p1/M, z10.b, z15.b\n"
+      ".inst 0xa08f2563  // smopa za3.s, p1/M, p1/M, z11.b, z15.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       "tbz x16, #1, 14f\n"
@@ -222,25 +219,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
-      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xa041c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa042c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa040c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xa041c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
+      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
+      ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+      ".inst 0xa060c1c4  // st1w { z4.s-z7.s }, pn8.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1d4  // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 29f\n"
@@ -248,12 +245,12 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xa060c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xa060c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
       ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
       ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xa061c1c4  // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa061c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       ".inst 0xa062c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
@@ -275,30 +272,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 16f\n"
       "15:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      "st1w { z28.s }, p0, [x26]\n"
+      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      "st1w { z8.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
-      "st1w { z29.s }, p0, [x26]\n"
+      "st1w { z9.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "add x12, x12, #0x4\n"
-      "st1w { z30.s }, p0, [x26]\n"
+      "st1w { z10.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z31.s }, p0, [x26]\n"
+      "st1w { z11.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "blt 15b\n"
       "16:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 17f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
-      "st1w { z8.s }, p0, [x26]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      "st1w { z4.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 17f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z9.s }, p0, [x26]\n"
+      "st1w { z5.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 17f\n"
-      "st1w { z10.s }, p0, [x26]\n"
+      "st1w { z6.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "17:"  // Store to output array: Accumulator row 0 oddments: End
       "subs x25, x25, x22\n"
@@ -310,30 +307,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 19f\n"
       "18:"  // Store to output array: Accumulator row 1 loop
-      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
-      "st1w { z0.s }, p0, [x26]\n"
+      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
-      "st1w { z1.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "add x12, x12, #0x4\n"
-      "st1w { z2.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z3.s }, p0, [x26]\n"
+      "st1w { z19.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "blt 18b\n"
       "19:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 20f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      "st1w { z20.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 20f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z21.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 20f\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z22.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "20:"  // Store to output array: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -345,30 +342,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 22f\n"
       "21:"  // Store to output array: Accumulator row 2 loop
-      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
+      "st1w { z24.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z25.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "add x12, x12, #0x4\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z26.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z19.s }, p0, [x26]\n"
+      "st1w { z27.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "blt 21b\n"
       "22:"  // Store to output array: Accumulator row 2 oddments
       "cbz x20, 23f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860440  // mova { z0.s-z3.s }, za2h.s[x12]\n"
-      "st1w { z0.s }, p0, [x26]\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 23f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z1.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 23f\n"
-      "st1w { z2.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "23:"  // Store to output array: Accumulator row 2 oddments: End
       "subs x25, x25, x22\n"
@@ -380,44 +377,44 @@
       "and x20, x20, #0x3\n"
       "cbz x21, 25f\n"
       "24:"  // Store to output array: Accumulator row 3 loop
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      "st1w { z12.s }, p0, [x26]\n"
+      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
+      "st1w { z16.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
-      "st1w { z13.s }, p0, [x26]\n"
+      "st1w { z17.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "add x12, x12, #0x4\n"
-      "st1w { z14.s }, p0, [x26]\n"
+      "st1w { z18.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "cmp x12, x21, LSL #2\n"
-      "st1w { z15.s }, p0, [x26]\n"
+      "st1w { z19.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "blt 24b\n"
       "25:"  // Store to output array: Accumulator row 3 oddments
       "cbz x20, 26f\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      "st1w { z16.s }, p0, [x26]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      "st1w { z12.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 26f\n"
       "subs x20, x20, #0x1\n"
-      "st1w { z17.s }, p0, [x26]\n"
+      "st1w { z13.s }, p0, [x26]\n"
       "add x26, x26, x23\n"
       "beq 26f\n"
-      "st1w { z18.s }, p0, [x26]\n"
+      "st1w { z14.s }, p0, [x26]\n"
       "26:"  // Store to output array: Accumulator row 3 oddments: End
       "27:"  // Store to output array: End
       "tbz x16, #0, 29f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "28:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa040c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
       ".inst 0xa041c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840702  // mova za2h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa043c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -441,4 +438,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
index c7bd38d..fb84883 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
 
-  cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
index d868ed2..96247d2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -100,14 +99,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
-      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa041c5bc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
-      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa042c5bc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
-      ".inst 0xc0840782  // mova za2h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa043c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n"
+      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa041c5a8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+      ".inst 0xc0840501  // mova za1h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa042c5a8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+      ".inst 0xc0840502  // mova za2h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa043c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x13, x13, #16\n"
@@ -125,11 +124,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      ".inst 0xa01cc299  // ldnt1w { z24.s-z27.s }, p8/Z, [x20, x28, LSL #2]\n"
-      ".inst 0xc0902700  // addha za0.s, p1/M, p1/M, z24.s\n"
-      ".inst 0xc0902721  // addha za1.s, p1/M, p1/M, z25.s\n"
-      ".inst 0xc0902742  // addha za2.s, p1/M, p1/M, z26.s\n"
-      ".inst 0xc0902763  // addha za3.s, p1/M, p1/M, z27.s\n"
+      ".inst 0xa11cc289  // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n"
+      ".inst 0xc0902420  // addha za0.s, p1/M, p1/M, z1.s\n"
+      ".inst 0xc09024a1  // addha za1.s, p1/M, p1/M, z5.s\n"
+      ".inst 0xc0902522  // addha za2.s, p1/M, p1/M, z9.s\n"
+      ".inst 0xc09025a3  // addha za3.s, p1/M, p1/M, z13.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x28\n"
       "mov x21, x9\n"
@@ -152,107 +151,107 @@
       "madd x23, x28, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      "ld1b { z10.b }, p1/Z, [x25]\n"
-      ".inst 0xa04086fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
-      "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
-      ".inst 0xa04186ed  // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
-      ".inst 0xa04286f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1b { z20.b }, p1/Z, [x25]\n"
+      ".inst 0xa04086e5  // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+      "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+      ".inst 0xa04186f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+      ".inst 0xa04286fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
       "addvl x25, x25, #4\n"
-      ".inst 0xa04386e1  // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa04386f1  // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa1bc2540  // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+      ".inst 0xa1a42680  // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1bd2541  // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
-      ".inst 0xa1be2542  // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
-      ".inst 0xa1bf2543  // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
-      "ld1b { z10.b }, p1/Z, [x25]\n"
-      ".inst 0xa1ac2600  // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
-      ".inst 0xa04086fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa1ad2601  // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
-      ".inst 0xa1ae2602  // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
-      ".inst 0xa1af2603  // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
-      "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n"
-      ".inst 0xa1b826a0  // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
-      ".inst 0xa04186ed  // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa1b926a1  // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
-      ".inst 0xa1ba26a2  // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
-      ".inst 0xa1bb26a3  // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
-      "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n"
-      ".inst 0xa04286f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
-      ".inst 0xa1a02660  // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
-      ".inst 0xa1a12661  // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
-      ".inst 0xa1a22662  // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
-      ".inst 0xa1a32663  // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
-      "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n"
+      ".inst 0xa1a52681  // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+      ".inst 0xa1a62682  // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+      ".inst 0xa1a72683  // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+      "ld1b { z20.b }, p1/Z, [x25]\n"
+      ".inst 0xa1b82560  // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+      ".inst 0xa04086e5  // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa1b92561  // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa1ba2562  // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+      ".inst 0xa1bb2563  // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+      "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n"
+      ".inst 0xa1bc2440  // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+      ".inst 0xa04186f9  // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa1bd2441  // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+      ".inst 0xa1be2442  // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+      ".inst 0xa1bf2443  // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+      "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n"
+      ".inst 0xa04286fd  // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+      ".inst 0xa1b025c0  // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+      ".inst 0xa1b125c1  // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+      ".inst 0xa1b225c2  // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+      ".inst 0xa1b325c3  // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
+      "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n"
       "addvl x25, x25, #4\n"
-      ".inst 0xa04386e1  // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+      ".inst 0xa04386f1  // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
       "addvl x23, x23, #16\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa1bc2540  // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
-      ".inst 0xa1bd2541  // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
-      ".inst 0xa1be2542  // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
-      ".inst 0xa1bf2543  // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
-      ".inst 0xa1ac2600  // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
-      ".inst 0xa1ad2601  // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
-      ".inst 0xa1ae2602  // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
-      ".inst 0xa1af2603  // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
-      ".inst 0xa1b826a0  // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
-      ".inst 0xa1b926a1  // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
-      ".inst 0xa1ba26a2  // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
-      ".inst 0xa1bb26a3  // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
-      ".inst 0xa1a02660  // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
-      ".inst 0xa1a12661  // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
-      ".inst 0xa1a22662  // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
-      ".inst 0xa1a32663  // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+      ".inst 0xa1a42680  // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n"
+      ".inst 0xa1a52681  // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n"
+      ".inst 0xa1a62682  // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n"
+      ".inst 0xa1a72683  // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n"
+      ".inst 0xa1b82560  // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n"
+      ".inst 0xa1b92561  // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa1ba2562  // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n"
+      ".inst 0xa1bb2563  // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n"
+      ".inst 0xa1bc2440  // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n"
+      ".inst 0xa1bd2441  // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n"
+      ".inst 0xa1be2442  // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n"
+      ".inst 0xa1bf2443  // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n"
+      ".inst 0xa1b025c0  // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n"
+      ".inst 0xa1b125c1  // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n"
+      ".inst 0xa1b225c2  // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n"
+      ".inst 0xa1b325c3  // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      "ld1b { z10.b }, p1/Z, [x25]\n"
+      "ld1b { z16.b }, p1/Z, [x25]\n"
       "subs x21, x21, #0x1\n"
       "addvl x25, x25, #1\n"
-      ".inst 0xa04086fc  // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa04086e4  // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #4\n"
-      ".inst 0xa1bc2540  // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
-      ".inst 0xa1bd2541  // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
-      ".inst 0xa1be2542  // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
-      ".inst 0xa1bf2543  // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+      ".inst 0xa1a42600  // umopa za0.s, p1/M, p1/M, z16.b, z4.b\n"
+      ".inst 0xa1a52601  // umopa za1.s, p1/M, p1/M, z16.b, z5.b\n"
+      ".inst 0xa1a62602  // umopa za2.s, p1/M, p1/M, z16.b, z6.b\n"
+      ".inst 0xa1a72603  // umopa za3.s, p1/M, p1/M, z16.b, z7.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
-      "ld1w { z14.s }, p1/Z, [x25]\n"
+      "ld1w { z15.s }, p1/Z, [x25]\n"
       "addvl x25, x25, #1\n"
-      ".inst 0xc09125c0  // addva za0.s, p1/M, p1/M, z14.s\n"
-      ".inst 0xc09125c1  // addva za1.s, p1/M, p1/M, z14.s\n"
-      ".inst 0xc09125c2  // addva za2.s, p1/M, p1/M, z14.s\n"
-      ".inst 0xc09125c3  // addva za3.s, p1/M, p1/M, z14.s\n"
+      ".inst 0xc09125e0  // addva za0.s, p1/M, p1/M, z15.s\n"
+      ".inst 0xc09125e1  // addva za1.s, p1/M, p1/M, z15.s\n"
+      ".inst 0xc09125e2  // addva za2.s, p1/M, p1/M, z15.s\n"
+      ".inst 0xc09125e3  // addva za3.s, p1/M, p1/M, z15.s\n"
       "tbz x14, #1, 14f\n"
       "tbz x14, #0, 12f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c5b8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x13]\n"
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      ".inst 0xc0840700  // mova za0h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
-      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
-      ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa043c5a4  // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c5a0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n"
+      ".inst 0xc0860418  // mova { z24.s-z27.s }, za0h.s[x12]\n"
+      ".inst 0xc0840400  // mova za0h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5a0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c57c  // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
+      ".inst 0xa060c578  // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
       "addvl x13, x13, #16\n"
-      ".inst 0xa061c568  // st1w { z8.s-z11.s }, pn9.b, [x11, #0x4, MUL VL]\n"
-      ".inst 0xa062c578  // st1w { z24.s-z27.s }, pn9.b, [x11, #0x8, MUL VL]\n"
-      ".inst 0xa063c56c  // st1w { z12.s-z15.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+      ".inst 0xa061c564  // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+      ".inst 0xa062c574  // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+      ".inst 0xa063c560  // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n"
       "addvl x11, x11, #16\n"
       "blt 11b\n"
       "b 21f\n"
@@ -260,16 +259,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      ".inst 0xc0860420  // mova { z0.s-z3.s }, za1h.s[x12]\n"
-      ".inst 0xa060c57c  // st1w { z28.s-z31.s }, pn9.b, [x11]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa061c560  // st1w { z0.s-z3.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
+      ".inst 0xa060c564  // st1w { z4.s-z7.s }, pn9.b, [x11]\n"
+      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
+      ".inst 0xc0860468  // mova { z8.s-z11.s }, za3h.s[x12]\n"
+      ".inst 0xa061c574  // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c568  // st1w { z8.s-z11.s }, pn9.b, [x11, #0x8, MUL VL]\n"
-      ".inst 0xa063c570  // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+      ".inst 0xa062c56c  // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+      ".inst 0xa063c568  // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n"
       "addvl x11, x11, #16\n"
       "blt 13b\n"
       "b 21f\n"
@@ -277,17 +276,17 @@
       "ldr x24, [%x[args], %[offsetof_C]]\n"
       "add x24, x24, x28\n"  // C += n
       "sub x23, x10, x9\n"
-      "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
       "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
       "madd x24, x9, x22, x24\n"  // C += m * ldc
-      "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+      "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
       "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
       "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
       "tbz x14, #2, 15f\n"
@@ -295,10 +294,10 @@
       "add x21, x21, x28\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa040c28c  // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
+      ".inst 0xa040c284  // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa040c284  // ld1w { z4.s-z7.s }, p8/Z, [x20]\n"
+      ".inst 0xa040c28c  // ld1w { z12.s-z15.s }, p8/Z, [x20]\n"
       "15:"  // Store to output array: Load per-channel parameters: End
       "cntw x20\n"
       "whilelt p0.b, x28, x27\n"
@@ -311,22 +310,22 @@
       "16:"  // Store to output array: Accumulator row 0 loop
       ".inst 0xc086001a  // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
       ".inst 0xc086005c  // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
-      ".inst 0xc1aca41a  // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+      ".inst 0xc1a4a41a  // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
       ".inst 0xc0860096  // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
       ".inst 0xc08600d0  // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
-      ".inst 0xc1ada41c  // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
-      ".inst 0xc1aea416  // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+      ".inst 0xc1a5a41c  // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+      ".inst 0xc1a6a416  // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
       "add x12, x12, #0x2\n"
       "cmp x12, x21, LSL #1\n"
-      ".inst 0xc1afa410  // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
-      ".inst 0xc1a4a23a  // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
-      ".inst 0xc1a5a23c  // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
-      ".inst 0xc1a6a236  // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
-      ".inst 0xc1a7a230  // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
-      ".inst 0xc1a1a31a  // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
-      ".inst 0xc1a1a31c  // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
-      ".inst 0xc1a1a316  // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
-      ".inst 0xc1a1a310  // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+      ".inst 0xc1a7a410  // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+      ".inst 0xc1aca23a  // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+      ".inst 0xc1ada23c  // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+      ".inst 0xc1aea236  // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+      ".inst 0xc1afa230  // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+      ".inst 0xc1a0a31a  // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+      ".inst 0xc1a0a31c  // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n"
+      ".inst 0xc1a0a316  // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n"
+      ".inst 0xc1a0a310  // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n"
       ".inst 0xc1b4c6ba  // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
       ".inst 0xc1b4c6bc  // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
       "uzp1 z19.b, z26.b, z28.b\n"
@@ -344,29 +343,29 @@
       "blt 16b\n"
       "17:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 18f\n"
-      ".inst 0xc0860002  // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+      ".inst 0xc086000a  // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n"
       ".inst 0xc0860058  // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
-      ".inst 0xc1aca402  // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
-      ".inst 0xc0860090  // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
-      ".inst 0xc08600ca  // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
-      ".inst 0xc1ada418  // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
-      ".inst 0xc1aea410  // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
-      ".inst 0xc1afa40a  // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
-      ".inst 0xc1a4a222  // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
-      ".inst 0xc1a5a238  // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
-      ".inst 0xc1a6a230  // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
-      ".inst 0xc1a7a22a  // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
-      ".inst 0xc1a1a302  // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
-      ".inst 0xc1a1a318  // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
-      ".inst 0xc1a1a310  // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
-      ".inst 0xc1a1a30a  // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
-      ".inst 0xc1b4c6a2  // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
-      ".inst 0xc1b4c6b8  // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
-      "uzp1 z23.b, z2.b, z24.b\n"
-      ".inst 0xc1b4c6b0  // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+      ".inst 0xc1a4a40a  // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n"
+      ".inst 0xc086009a  // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n"
+      ".inst 0xc08600de  // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n"
+      ".inst 0xc1a5a418  // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+      ".inst 0xc1a6a41a  // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n"
+      ".inst 0xc1a7a41e  // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n"
+      ".inst 0xc1aca22a  // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n"
+      ".inst 0xc1ada238  // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+      ".inst 0xc1aea23a  // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n"
+      ".inst 0xc1afa23e  // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n"
+      ".inst 0xc1a0a30a  // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n"
+      ".inst 0xc1a0a318  // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
+      ".inst 0xc1a0a31a  // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n"
+      ".inst 0xc1a0a31e  // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n"
       ".inst 0xc1b4c6aa  // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
-      "uzp1 z16.b, z16.b, z10.b\n"
-      "uzp1 z16.b, z23.b, z16.b\n"
+      ".inst 0xc1b4c6b8  // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+      "uzp1 z17.b, z10.b, z24.b\n"
+      ".inst 0xc1b4c6ba  // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+      ".inst 0xc1b4c6be  // sclamp { z30.s-z31.s }, z21.s, z20.s\n"
+      "uzp1 z16.b, z26.b, z30.b\n"
+      "uzp1 z16.b, z17.b, z16.b\n"
       "st1b { z16.b }, p0, [x24]\n"
       "18:"  // Store to output array: Accumulator row 0 oddments: End
       "19:"  // Store to output array: End
@@ -374,14 +373,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "20:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
-      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa041c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
-      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5bc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n"
+      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa041c5b0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa042c5ac  // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+      ".inst 0xc0840582  // mova za2h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa043c5a0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+      ".inst 0xc0840403  // mova za3h.s[x12], { z0.s-z3.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x13, x13, #16\n"
@@ -405,4 +404,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
index 123405b..f8c375f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
 
-  cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
index cb0e952..9a59799 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -100,14 +99,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0840400  // mova za0h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa041c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840402  // mova za2h.s[x12], { z0.s-z3.s }\n"
-      ".inst 0xa043c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa040c5e8  // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0840500  // mova za0h.s[x12], { z8.s-z11.s }\n"
+      ".inst 0xa041c5e0  // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840401  // mova za1h.s[x12], { z0.s-z3.s }\n"
+      ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840603  // mova za3h.s[x12], { z16.s-z19.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      ".inst 0xa00a4295  // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n"
-      ".inst 0xc0902680  // addha za0.s, p1/M, p1/M, z20.s\n"
-      ".inst 0xc09026a1  // addha za1.s, p1/M, p1/M, z21.s\n"
-      ".inst 0xc0902682  // addha za2.s, p1/M, p1/M, z20.s\n"
-      ".inst 0xc09026a3  // addha za3.s, p1/M, p1/M, z21.s\n"
+      ".inst 0xa00a4299  // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n"
+      ".inst 0xc0902700  // addha za0.s, p1/M, p1/M, z24.s\n"
+      ".inst 0xc0902721  // addha za1.s, p1/M, p1/M, z25.s\n"
+      ".inst 0xc0902702  // addha za2.s, p1/M, p1/M, z24.s\n"
+      ".inst 0xc0902723  // addha za3.s, p1/M, p1/M, z25.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -152,75 +151,75 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa040077e  // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
-      ".inst 0xa04006f1  // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa041076e  // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa04106e9  // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa0420760  // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa14206fc  // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa0430764  // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0xa1400763  // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa14006f9  // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa1410774  // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa04106f7  // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa1420775  // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa14206f8  // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa1430765  // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14306ea  // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa14306ef  // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa1b027c0  // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+      ".inst 0xa1b12460  // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1b127c1  // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
-      ".inst 0xa1b027e2  // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
-      ".inst 0xa1b127e3  // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
-      ".inst 0xa040077e  // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
-      ".inst 0xa1a825c0  // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
-      ".inst 0xa04006f1  // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
-      ".inst 0xa1a925c1  // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
-      ".inst 0xa1a825e2  // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
-      ".inst 0xa1a925e3  // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
-      ".inst 0xa041076e  // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
-      ".inst 0xa1b42400  // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
-      ".inst 0xa04106e9  // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
-      ".inst 0xa1bc2401  // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
-      ".inst 0xa1b42422  // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
-      ".inst 0xa1bc2423  // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
-      ".inst 0xa0420760  // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa14206fc  // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
-      ".inst 0xa1a22480  // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
-      ".inst 0xa1aa2481  // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
-      ".inst 0xa1a224a2  // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
-      ".inst 0xa1aa24a3  // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
-      ".inst 0xa0430764  // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
+      ".inst 0xa1b92461  // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+      ".inst 0xa1b12562  // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+      ".inst 0xa1b92563  // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa1400763  // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa1b62680  // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+      ".inst 0xa14006f9  // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n"
+      ".inst 0xa1b72681  // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+      ".inst 0xa1b62782  // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+      ".inst 0xa1b72783  // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+      ".inst 0xa1410774  // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n"
+      ".inst 0xa1b026a0  // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+      ".inst 0xa04106f7  // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n"
+      ".inst 0xa1b826a1  // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+      ".inst 0xa1b027a2  // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+      ".inst 0xa1b827a3  // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+      ".inst 0xa1420775  // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa14206f8  // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+      ".inst 0xa1a724a0  // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+      ".inst 0xa1af24a1  // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+      ".inst 0xa1a725a2  // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+      ".inst 0xa1af25a3  // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
+      ".inst 0xa1430765  // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n"
       "addvl x27, x27, #8\n"
-      ".inst 0xa14306ea  // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
+      ".inst 0xa14306ef  // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n"
       "addvl x23, x23, #8\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa1b027c0  // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
-      ".inst 0xa1b127c1  // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
-      ".inst 0xa1b027e2  // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
-      ".inst 0xa1b127e3  // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
-      ".inst 0xa1a825c0  // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
-      ".inst 0xa1a925c1  // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
-      ".inst 0xa1a825e2  // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
-      ".inst 0xa1a925e3  // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
-      ".inst 0xa1b42400  // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
-      ".inst 0xa1bc2401  // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
-      ".inst 0xa1b42422  // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
-      ".inst 0xa1bc2423  // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
-      ".inst 0xa1a22480  // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
-      ".inst 0xa1aa2481  // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
-      ".inst 0xa1a224a2  // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
-      ".inst 0xa1aa24a3  // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+      ".inst 0xa1b12460  // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n"
+      ".inst 0xa1b92461  // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n"
+      ".inst 0xa1b12562  // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n"
+      ".inst 0xa1b92563  // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n"
+      ".inst 0xa1b62680  // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n"
+      ".inst 0xa1b72681  // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n"
+      ".inst 0xa1b62782  // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n"
+      ".inst 0xa1b72783  // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n"
+      ".inst 0xa1b026a0  // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n"
+      ".inst 0xa1b826a1  // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n"
+      ".inst 0xa1b027a2  // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n"
+      ".inst 0xa1b827a3  // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n"
+      ".inst 0xa1a724a0  // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n"
+      ".inst 0xa1af24a1  // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n"
+      ".inst 0xa1a725a2  // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n"
+      ".inst 0xa1af25a3  // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
-      ".inst 0xa040077e  // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n"
+      ".inst 0xa1400773  // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #2\n"
       ".inst 0xa04006f0  // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n"
       "addvl x23, x23, #2\n"
-      ".inst 0xa1b027c0  // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
-      ".inst 0xa1b127c1  // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
-      ".inst 0xa1b027e2  // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
-      ".inst 0xa1b127e3  // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+      ".inst 0xa1b02660  // umopa za0.s, p1/M, p1/M, z19.b, z16.b\n"
+      ".inst 0xa1b12661  // umopa za1.s, p1/M, p1/M, z19.b, z17.b\n"
+      ".inst 0xa1b02762  // umopa za2.s, p1/M, p1/M, z27.b, z16.b\n"
+      ".inst 0xa1b12763  // umopa za3.s, p1/M, p1/M, z27.b, z17.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
       ".inst 0xa040476e  // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n"
@@ -234,25 +233,25 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15]\n"
-      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
-      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xc0860434  // mova { z20.s-z23.s }, za1h.s[x12]\n"
-      ".inst 0xa041c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xc086045c  // mova { z28.s-z31.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xa042c5f8  // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840702  // mova za2h.s[x12], { z24.s-z27.s }\n"
-      ".inst 0xa043c5ec  // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840583  // mova za3h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xa040c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
+      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xa041c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840781  // mova za1h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
+      ".inst 0xa042c5f0  // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa043c5fc  // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840783  // mova za3h.s[x12], { z28.s-z31.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14]\n"
+      ".inst 0xa060c5c0  // st1w { z0.s-z3.s }, pn9.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c5d4  // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c5dc  // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 24f\n"
@@ -260,16 +259,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
-      ".inst 0xa060c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14]\n"
-      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
-      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
-      ".inst 0xa061c5c4  // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xa060c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14]\n"
+      ".inst 0xc0860450  // mova { z16.s-z19.s }, za2h.s[x12]\n"
+      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
+      ".inst 0xa061c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c5c8  // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c5cc  // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c5d0  // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c5d4  // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 24f\n"
@@ -277,13 +276,13 @@
       "ldr x26, [%x[args], %[offsetof_C]]\n"
       "add x26, x26, x10\n"  // C += n
       "sub x25, x13, x11\n"
-      "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
       "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
       "madd x26, x11, x24, x26\n"  // C += m * ldc
-      "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
-      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
       "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
       "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
       "tbz x16, #2, 15f\n"
@@ -291,10 +290,10 @@
       "add x21, x21, x10\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa0404282  // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
+      ".inst 0xa0404280  // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
       "add x20, x20, x21, LSL #2\n"
-      ".inst 0xa0404280  // ld1w { z0.s-z1.s }, p8/Z, [x20]\n"
+      ".inst 0xa0404282  // ld1w { z2.s-z3.s }, p8/Z, [x20]\n"
       "15:"  // Store to output array: Load per-channel parameters: End
       "cntw x23\n"
       "whilelt p0.h, x10, x9\n"
@@ -305,26 +304,26 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 17f\n"
       "16:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
-      ".inst 0xc1a2ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
-      ".inst 0xc1a3ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+      ".inst 0xc0860404  // mova { z4.s-z7.s }, za0h.s[x12]\n"
+      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1a1ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a0aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
-      ".inst 0xc1a1aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
-      ".inst 0xc1abab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
-      ".inst 0xc1abab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
-      ".inst 0xc1b8cf2c  // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
-      ".inst 0xc1b8cf3c  // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z12.h, z28.h\n"
+      ".inst 0xc1a2aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc1a3aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1aeab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+      ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+      ".inst 0xc1b8cf28  // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+      "uzp1 z16.h, z4.h, z8.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z13.h, z29.h\n"
-      "uzp1 z17.h, z14.h, z30.h\n"
+      "uzp1 z16.h, z5.h, z9.h\n"
+      "uzp1 z17.h, z6.h, z10.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z15.h, z31.h\n"
+      "uzp1 z16.h, z7.h, z11.h\n"
       "st1b { z17.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "st1b { z16.h }, p0, [x26]\n"
@@ -332,27 +331,27 @@
       "blt 16b\n"
       "17:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 18f\n"
-      ".inst 0xc086041c  // mova { z28.s-z31.s }, za0h.s[x12]\n"
-      ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xc1a2ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
-      ".inst 0xc1a3ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+      ".inst 0xc0860408  // mova { z8.s-z11.s }, za0h.s[x12]\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1a0ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+      ".inst 0xc1a1ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a0aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
-      ".inst 0xc1a1aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
-      ".inst 0xc1abab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
-      ".inst 0xc1abab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
-      ".inst 0xc1b8cf3c  // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
-      ".inst 0xc1b8cf2c  // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z28.h, z12.h\n"
+      ".inst 0xc1a2aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
+      ".inst 0xc1a3aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+      ".inst 0xc1aeab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1b8cf28  // sclamp { z8.s-z11.s }, z25.s, z24.s\n"
+      ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+      "uzp1 z16.h, z8.h, z4.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 18f\n"
       "subs x20, x20, #0x1\n"
-      "uzp1 z16.h, z29.h, z13.h\n"
+      "uzp1 z16.h, z9.h, z5.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 18f\n"
-      "uzp1 z16.h, z30.h, z14.h\n"
+      "uzp1 z16.h, z10.h, z6.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "18:"  // Store to output array: Accumulator row 0 oddments: End
@@ -367,25 +366,25 @@
       "cbz x21, 20f\n"
       "19:"  // Store to output array: Accumulator row 1 loop
       ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
-      ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
-      ".inst 0xc1a3ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
+      ".inst 0xc1a0ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1a1ac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a0aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
-      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
-      ".inst 0xc1abab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
-      ".inst 0xc1abab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+      ".inst 0xc1a2aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc1a3aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1aeab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n"
       ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
-      ".inst 0xc1b8cf30  // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z4.h, z16.h\n"
+      ".inst 0xc1b8cf34  // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+      "uzp1 z16.h, z4.h, z20.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z5.h, z17.h\n"
-      "uzp1 z17.h, z6.h, z18.h\n"
+      "uzp1 z16.h, z5.h, z21.h\n"
+      "uzp1 z17.h, z6.h, z22.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "uzp1 z16.h, z7.h, z19.h\n"
+      "uzp1 z16.h, z7.h, z23.h\n"
       "st1b { z17.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "st1b { z16.h }, p0, [x26]\n"
@@ -393,27 +392,27 @@
       "blt 19b\n"
       "20:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 21f\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
       ".inst 0xc0860470  // mova { z16.s-z19.s }, za3h.s[x12]\n"
-      ".inst 0xc1a2ac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
-      ".inst 0xc1a3ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+      ".inst 0xc1a0ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1a1ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a0aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
-      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
-      ".inst 0xc1abab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
-      ".inst 0xc1abab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
-      ".inst 0xc1b8cf34  // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+      ".inst 0xc1a2aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+      ".inst 0xc1a3aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+      ".inst 0xc1aeab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+      ".inst 0xc1aeab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n"
+      ".inst 0xc1b8cf24  // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
       ".inst 0xc1b8cf30  // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
-      "uzp1 z16.h, z20.h, z16.h\n"
+      "uzp1 z16.h, z4.h, z16.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
       "subs x20, x20, #0x1\n"
-      "uzp1 z16.h, z21.h, z17.h\n"
+      "uzp1 z16.h, z5.h, z17.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
-      "uzp1 z16.h, z22.h, z18.h\n"
+      "uzp1 z16.h, z6.h, z18.h\n"
       "st1b { z16.h }, p0, [x26]\n"
       "21:"  // Store to output array: Accumulator row 1 oddments: End
       "22:"  // Store to output array: End
@@ -452,4 +451,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
index 2e61cf4..04d1932 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include <cstdint>
 #include "../std_transforms_sme.hpp"
@@ -83,12 +83,11 @@
 
   StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
 
-  cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *ci)
+  cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *)
   {
-    ARM_COMPUTE_UNUSED(ci);
   }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SME2
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
index 8f8886b..0f3346e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef __ARM_FEATURE_SVE
 #ifdef ARM_COMPUTE_ENABLE_SME2
 
 #include "arm_gemm.hpp"
@@ -100,14 +99,14 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "1:"  // Initial accumulator load from buffer: Loop
-      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
-      ".inst 0xa041c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840581  // mova za1h.s[x12], { z12.s-z15.s }\n"
-      ".inst 0xa042c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
-      ".inst 0xc0840682  // mova za2h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xa043c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840703  // mova za3h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa040c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840600  // mova za0h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xa041c1f8  // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840701  // mova za1h.s[x12], { z24.s-z27.s }\n"
+      ".inst 0xa042c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
+      ".inst 0xc0840782  // mova za2h.s[x12], { z28.s-z31.s }\n"
+      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -125,11 +124,11 @@
       "ldr x20, [%x[args], %[offsetof_bias]]\n"
       ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
       "cbz x20, 5f\n"
-      "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n"
-      ".inst 0xc09025e0  // addha za0.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e1  // addha za1.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e2  // addha za2.s, p1/M, p1/M, z15.s\n"
-      ".inst 0xc09025e3  // addha za3.s, p1/M, p1/M, z15.s\n"
+      "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n"
+      ".inst 0xc0902500  // addha za0.s, p1/M, p1/M, z8.s\n"
+      ".inst 0xc0902501  // addha za1.s, p1/M, p1/M, z8.s\n"
+      ".inst 0xc0902502  // addha za2.s, p1/M, p1/M, z8.s\n"
+      ".inst 0xc0902503  // addha za3.s, p1/M, p1/M, z8.s\n"
       "4:"  // Prepare accumulators: Test for last block
       "mov x20, x10\n"
       "mov x21, x11\n"
@@ -152,107 +151,107 @@
       "madd x23, x10, x20, x23\n"  // bptr = B + n * kstride_bytes
       "cbz x22, 8f\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
-      "ldnt1b { z0.b }, p1/Z, [x23]\n"
-      ".inst 0xa1418373  // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa1428370  // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa1438362  // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa0408364  // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+      "ldnt1b { z14.b }, p1/Z, [x23]\n"
+      ".inst 0xa0418374  // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa0428378  // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa0438368  // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "ble 7f\n"
       "6:"  // K loop
-      ".inst 0xa1a02640  // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+      ".inst 0xa1ae2480  // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
       "subs x22, x22, #0x1\n"
-      ".inst 0xa1a026c1  // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
-      ".inst 0xa1a02742  // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
-      ".inst 0xa1a027c3  // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
-      ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
-      ".inst 0xa1a92660  // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
-      "ldnt1b { z0.b }, p1/Z, [x23]\n"
-      ".inst 0xa1a926e1  // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
-      ".inst 0xa1a92762  // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
-      ".inst 0xa1a927e3  // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
-      ".inst 0xa1418373  // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
-      ".inst 0xa1b52600  // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
-      "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n"
-      ".inst 0xa1b52681  // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
-      ".inst 0xa1b52702  // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
-      ".inst 0xa1b52783  // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
-      ".inst 0xa1428370  // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
-      "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n"
-      ".inst 0xa1ac2440  // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
-      ".inst 0xa1ac24c1  // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
-      ".inst 0xa1ac2542  // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
-      ".inst 0xa1ac25c3  // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
-      ".inst 0xa1438362  // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
+      ".inst 0xa1ae24a1  // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+      ".inst 0xa1ae24c2  // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+      ".inst 0xa1ae24e3  // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+      ".inst 0xa0408364  // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n"
+      ".inst 0xa1bf2680  // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+      "ldnt1b { z14.b }, p1/Z, [x23]\n"
+      ".inst 0xa1bf26a1  // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+      ".inst 0xa1bf26c2  // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+      ".inst 0xa1bf26e3  // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+      ".inst 0xa0418374  // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n"
+      ".inst 0xa1ad2700  // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+      "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n"
+      ".inst 0xa1ad2721  // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+      ".inst 0xa1ad2742  // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+      ".inst 0xa1ad2763  // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+      ".inst 0xa0428378  // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n"
+      "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n"
+      ".inst 0xa1bd2500  // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+      ".inst 0xa1bd2521  // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+      ".inst 0xa1bd2542  // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+      ".inst 0xa1bd2563  // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
+      ".inst 0xa0438368  // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n"
       "addvl x27, x27, #16\n"
-      "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n"
+      "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n"
       "addvl x23, x23, #4\n"
       "bgt 6b\n"
       "7:"  // K loop tail
-      ".inst 0xa1a02640  // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
-      ".inst 0xa1a026c1  // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
-      ".inst 0xa1a02742  // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
-      ".inst 0xa1a027c3  // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
-      ".inst 0xa1a92660  // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
-      ".inst 0xa1a926e1  // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
-      ".inst 0xa1a92762  // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
-      ".inst 0xa1a927e3  // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
-      ".inst 0xa1b52600  // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
-      ".inst 0xa1b52681  // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
-      ".inst 0xa1b52702  // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
-      ".inst 0xa1b52783  // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
-      ".inst 0xa1ac2440  // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
-      ".inst 0xa1ac24c1  // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
-      ".inst 0xa1ac2542  // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
-      ".inst 0xa1ac25c3  // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+      ".inst 0xa1ae2480  // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n"
+      ".inst 0xa1ae24a1  // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n"
+      ".inst 0xa1ae24c2  // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n"
+      ".inst 0xa1ae24e3  // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n"
+      ".inst 0xa1bf2680  // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n"
+      ".inst 0xa1bf26a1  // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n"
+      ".inst 0xa1bf26c2  // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n"
+      ".inst 0xa1bf26e3  // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n"
+      ".inst 0xa1ad2700  // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n"
+      ".inst 0xa1ad2721  // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n"
+      ".inst 0xa1ad2742  // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n"
+      ".inst 0xa1ad2763  // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n"
+      ".inst 0xa1bd2500  // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n"
+      ".inst 0xa1bd2521  // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n"
+      ".inst 0xa1bd2542  // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n"
+      ".inst 0xa1bd2563  // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n"
       "8:"  // K oddments
       "cbz x21, 10f\n"
       "9:"  // K oddments: Loop
       ".inst 0xa1408372  // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n"
       "subs x21, x21, #0x1\n"
       "addvl x27, x27, #4\n"
-      "ld1b { z0.b }, p1/Z, [x23]\n"
+      "ld1b { z15.b }, p1/Z, [x23]\n"
       "addvl x23, x23, #1\n"
-      ".inst 0xa1a02640  // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
-      ".inst 0xa1a026c1  // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
-      ".inst 0xa1a02742  // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
-      ".inst 0xa1a027c3  // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+      ".inst 0xa1af2640  // umopa za0.s, p1/M, p1/M, z18.b, z15.b\n"
+      ".inst 0xa1af26c1  // umopa za1.s, p1/M, p1/M, z22.b, z15.b\n"
+      ".inst 0xa1af2742  // umopa za2.s, p1/M, p1/M, z26.b, z15.b\n"
+      ".inst 0xa1af27c3  // umopa za3.s, p1/M, p1/M, z30.b, z15.b\n"
       "bgt 9b\n"
       "10:"  // K oddments: End
-      ".inst 0xa040c360  // ld1w { z0.s-z3.s }, pn8.b/Z, [x27]\n"
+      ".inst 0xa140c363  // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n"
       "addvl x27, x27, #4\n"
-      ".inst 0xc0912400  // addva za0.s, p1/M, p1/M, z0.s\n"
-      ".inst 0xc0912421  // addva za1.s, p1/M, p1/M, z1.s\n"
-      ".inst 0xc0912442  // addva za2.s, p1/M, p1/M, z2.s\n"
-      ".inst 0xc0912463  // addva za3.s, p1/M, p1/M, z3.s\n"
+      ".inst 0xc0912460  // addva za0.s, p1/M, p1/M, z3.s\n"
+      ".inst 0xc09124e1  // addva za1.s, p1/M, p1/M, z7.s\n"
+      ".inst 0xc0912562  // addva za2.s, p1/M, p1/M, z11.s\n"
+      ".inst 0xc09125e3  // addva za3.s, p1/M, p1/M, z15.s\n"
       "tbz x16, #1, 14f\n"
       "tbz x16, #0, 12f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "11:"  // Store to partial result buffer: Store and refill: Loop
-      ".inst 0xa040c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc0840680  // mova za0h.s[x12], { z20.s-z23.s }\n"
-      ".inst 0xc0860428  // mova { z8.s-z11.s }, za1h.s[x12]\n"
-      ".inst 0xa041c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
-      ".inst 0xc0840481  // mova za1h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
-      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xa040c1ec  // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
+      ".inst 0xc0840580  // mova za0h.s[x12], { z12.s-z15.s }\n"
+      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
+      ".inst 0xa041c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
+      ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
+      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
+      ".inst 0xc086046c  // mova { z12.s-z15.s }, za3h.s[x12]\n"
       ".inst 0xa042c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
       ".inst 0xc0840482  // mova za2h.s[x12], { z4.s-z7.s }\n"
-      ".inst 0xa043c1f4  // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840683  // mova za3h.s[x12], { z20.s-z23.s }\n"
+      ".inst 0xa043c1e8  // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840503  // mova za3h.s[x12], { z8.s-z11.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa060c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
+      ".inst 0xa060c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
       "addvl x15, x15, #16\n"
-      ".inst 0xa061c1c8  // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n"
-      ".inst 0xa062c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa061c1dc  // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n"
+      ".inst 0xa062c1d4  // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 11b\n"
       "b 30f\n"
@@ -260,16 +259,16 @@
       "mov x12, #0x0\n"
       "cntw x20\n"
       "13:"  // Store to partial result buffer: Store only: Loop
-      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc0860400  // mova { z0.s-z3.s }, za0h.s[x12]\n"
       ".inst 0xc086042c  // mova { z12.s-z15.s }, za1h.s[x12]\n"
-      ".inst 0xa060c1d0  // st1w { z16.s-z19.s }, pn8.b, [x14]\n"
-      ".inst 0xc0860454  // mova { z20.s-z23.s }, za2h.s[x12]\n"
-      ".inst 0xc0860478  // mova { z24.s-z27.s }, za3h.s[x12]\n"
+      ".inst 0xa060c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14]\n"
+      ".inst 0xc0860444  // mova { z4.s-z7.s }, za2h.s[x12]\n"
+      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
       ".inst 0xa061c1cc  // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
-      ".inst 0xa062c1d4  // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n"
-      ".inst 0xa063c1d8  // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n"
+      ".inst 0xa062c1c4  // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n"
+      ".inst 0xa063c1c0  // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n"
       "addvl x14, x14, #16\n"
       "blt 13b\n"
       "b 30f\n"
@@ -277,22 +276,22 @@
       "ldr x26, [%x[args], %[offsetof_C]]\n"
       "add x26, x26, x10\n"  // C += n
       "sub x25, x13, x11\n"
-      "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+      "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
       "ldr x24, [%x[args], %[offsetof_ldcb]]\n"
       "madd x26, x11, x24, x26\n"  // C += m * ldc
-      "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
-      "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
-      "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
-      "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+      "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+      "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+      "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+      "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
       "tbz x16, #2, 15f\n"
       "ldr w21, [%x[args], %[offsetof_n_0]]\n"
       "add x21, x21, x10\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
       "add x20, x20, x21, LSL #2\n"
-      "ld1w { z8.s }, p0/Z, [x20]\n"
+      "ld1w { z2.s }, p0/Z, [x20]\n"
       "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
       "add x20, x20, x21, LSL #2\n"
-      "ld1w { z7.s }, p0/Z, [x20]\n"
+      "ld1w { z1.s }, p0/Z, [x20]\n"
       "15:"  // Store to output array: Load per-channel parameters: End
       "cntw x23\n"
       "whilelt p0.s, x10, x9\n"
@@ -303,30 +302,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 17f\n"
       "16:"  // Store to output array: Accumulator row 0 loop
-      ".inst 0xc086040c  // mova { z12.s-z15.s }, za0h.s[x12]\n"
-      ".inst 0xc1a8ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+      ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
+      ".inst 0xc1a2ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a4ccac  // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
-      "st1b { z12.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+      ".inst 0xc1b4ceb0  // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
+      "st1b { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z13.s }, p0, [x26]\n"
+      "st1b { z17.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z14.s }, p0, [x26]\n"
+      "st1b { z18.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z15.s }, p0, [x26]\n"
+      "st1b { z19.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 16b\n"
       "17:"  // Store to output array: Accumulator row 0 oddments
       "cbz x20, 18f\n"
       ".inst 0xc0860410  // mova { z16.s-z19.s }, za0h.s[x12]\n"
-      ".inst 0xc1a8ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+      ".inst 0xc1a2ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
-      ".inst 0xc1a6ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
-      ".inst 0xc1a4ccb0  // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+      ".inst 0xc1a1aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+      ".inst 0xc1a0ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n"
+      ".inst 0xc1b4ceb0  // sclamp { z16.s-z19.s }, z21.s, z20.s\n"
       "st1b { z16.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 18f\n"
@@ -347,38 +346,38 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 20f\n"
       "19:"  // Store to output array: Accumulator row 1 loop
-      ".inst 0xc0860430  // mova { z16.s-z19.s }, za1h.s[x12]\n"
-      ".inst 0xc1a8ac10  // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa30  // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+      ".inst 0xc1a1aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab10  // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
-      ".inst 0xc1a4ccb0  // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
-      "st1b { z16.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1b4cea4  // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+      "st1b { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z17.s }, p0, [x26]\n"
+      "st1b { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z18.s }, p0, [x26]\n"
+      "st1b { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z19.s }, p0, [x26]\n"
+      "st1b { z7.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 19b\n"
       "20:"  // Store to output array: Accumulator row 1 oddments
       "cbz x20, 21f\n"
-      ".inst 0xc086043c  // mova { z28.s-z31.s }, za1h.s[x12]\n"
-      ".inst 0xc1a8ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+      ".inst 0xc0860424  // mova { z4.s-z7.s }, za1h.s[x12]\n"
+      ".inst 0xc1a2ac04  // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
-      ".inst 0xc1a6ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
-      ".inst 0xc1a4ccbc  // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
-      "st1b { z28.s }, p0, [x26]\n"
+      ".inst 0xc1a1aa24  // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+      ".inst 0xc1a0ab04  // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+      ".inst 0xc1b4cea4  // sclamp { z4.s-z7.s }, z21.s, z20.s\n"
+      "st1b { z4.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
       "subs x20, x20, #0x1\n"
-      "st1b { z29.s }, p0, [x26]\n"
+      "st1b { z5.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 21f\n"
-      "st1b { z30.s }, p0, [x26]\n"
+      "st1b { z6.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "21:"  // Store to output array: Accumulator row 1 oddments: End
       "subs x25, x25, x22\n"
@@ -391,30 +390,30 @@
       "and x20, x22, #0x3\n"
       "cbz x21, 23f\n"
       "22:"  // Store to output array: Accumulator row 2 loop
-      ".inst 0xc0860458  // mova { z24.s-z27.s }, za2h.s[x12]\n"
-      ".inst 0xc1a8ac18  // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+      ".inst 0xc0860448  // mova { z8.s-z11.s }, za2h.s[x12]\n"
+      ".inst 0xc1a2ac08  // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa38  // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+      ".inst 0xc1a1aa28  // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab18  // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
-      ".inst 0xc1a4ccb8  // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
-      "st1b { z24.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab08  // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n"
+      ".inst 0xc1b4cea8  // sclamp { z8.s-z11.s }, z21.s, z20.s\n"
+      "st1b { z8.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z25.s }, p0, [x26]\n"
+      "st1b { z9.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z26.s }, p0, [x26]\n"
+      "st1b { z10.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z27.s }, p0, [x26]\n"
+      "st1b { z11.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 22b\n"
       "23:"  // Store to output array: Accumulator row 2 oddments
       "cbz x20, 24f\n"
       ".inst 0xc086044c  // mova { z12.s-z15.s }, za2h.s[x12]\n"
-      ".inst 0xc1a8ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+      ".inst 0xc1a2ac0c  // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
-      ".inst 0xc1a6ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
-      ".inst 0xc1a4ccac  // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+      ".inst 0xc1a1aa2c  // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+      ".inst 0xc1a0ab0c  // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+      ".inst 0xc1b4ceac  // sclamp { z12.s-z15.s }, z21.s, z20.s\n"
       "st1b { z12.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 24f\n"
@@ -435,52 +434,52 @@
       "and x20, x20, #0x3\n"
       "cbz x21, 26f\n"
       "25:"  // Store to output array: Accumulator row 3 loop
-      ".inst 0xc0860474  // mova { z20.s-z23.s }, za3h.s[x12]\n"
-      ".inst 0xc1a8ac14  // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a2ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
       "add x12, x12, #0x4\n"
-      ".inst 0xc1a7aa34  // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+      ".inst 0xc1a1aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
       "cmp x12, x21, LSL #2\n"
-      ".inst 0xc1a6ab14  // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
-      ".inst 0xc1a4ccb4  // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
-      "st1b { z20.s }, p0, [x26]\n"
+      ".inst 0xc1a0ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+      ".inst 0xc1b4cebc  // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1b { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z21.s }, p0, [x26]\n"
+      "st1b { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z22.s }, p0, [x26]\n"
+      "st1b { z30.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
-      "st1b { z23.s }, p0, [x26]\n"
+      "st1b { z31.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "blt 25b\n"
       "26:"  // Store to output array: Accumulator row 3 oddments
       "cbz x20, 27f\n"
-      ".inst 0xc0860460  // mova { z0.s-z3.s }, za3h.s[x12]\n"
-      ".inst 0xc1a8ac00  // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+      ".inst 0xc086047c  // mova { z28.s-z31.s }, za3h.s[x12]\n"
+      ".inst 0xc1a2ac1c  // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
       "subs x20, x20, #0x1\n"
-      ".inst 0xc1a7aa20  // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
-      ".inst 0xc1a6ab00  // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
-      ".inst 0xc1a4cca0  // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
-      "st1b { z0.s }, p0, [x26]\n"
+      ".inst 0xc1a1aa3c  // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+      ".inst 0xc1a0ab1c  // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+      ".inst 0xc1b4cebc  // sclamp { z28.s-z31.s }, z21.s, z20.s\n"
+      "st1b { z28.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 27f\n"
       "subs x20, x20, #0x1\n"
-      "st1b { z1.s }, p0, [x26]\n"
+      "st1b { z29.s }, p0, [x26]\n"
       "add x26, x26, x24\n"
       "beq 27f\n"
-      "st1b { z2.s }, p0, [x26]\n"
+      "st1b { z30.s }, p0, [x26]\n"
       "27:"  // Store to output array: Accumulator row 3 oddments: End
       "28:"  // Store to output array: End
       "tbz x16, #0, 30f\n"
       "mov x12, #0x0\n"
       "cntw x20\n"
       "29:"  // Store to output array: Refill accumulators: Loop
-      ".inst 0xa040c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
-      ".inst 0xc0840480  // mova za0h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa040c1fc  // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n"
+      ".inst 0xc0840780  // mova za0h.s[x12], { z28.s-z31.s }\n"
       ".inst 0xa041c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n"
       ".inst 0xc0840601  // mova za1h.s[x12], { z16.s-z19.s }\n"
       ".inst 0xa042c1f0  // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n"
       ".inst 0xc0840602  // mova za2h.s[x12], { z16.s-z19.s }\n"
-      ".inst 0xa043c1e4  // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
-      ".inst 0xc0840483  // mova za3h.s[x12], { z4.s-z7.s }\n"
+      ".inst 0xa043c1e0  // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n"
+      ".inst 0xc0840403  // mova za3h.s[x12], { z0.s-z3.s }\n"
       "add x12, x12, #0x4\n"
       "cmp x12, x20\n"
       "addvl x15, x15, #16\n"
@@ -504,4 +503,3 @@
 }  // namespace arm_gemm
 
 #endif  // ARM_COMPUTE_ENABLE_SME2
-#endif  // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
index e07fa54..1ce169d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
index 13f2e48..9136e32 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -157,16 +157,16 @@
       "b 6f\n"
       "4:"  // Height 1: no bias
       "tbz %x[flags], #0, 5f\n"
-      "ld1w { z9.s }, p4/Z, [x13]\n"
-      "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x13]\n"
+      "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "zip1 z8.d, z16.d, z12.d\n"
+      "zip2 z12.d, z16.d, z12.d\n"
+      "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 6f\n"
@@ -184,11 +184,11 @@
       "7:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 8f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 9f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -200,43 +200,43 @@
       "ble 11f\n"
       "10:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "ld1rqh { z20.h }, p0/Z, [x26]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x9]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+      ".inst 0x6471e688  // bfmmla z8.s, z20.h, z17.h\n"
+      ".inst 0x6470e68c  // bfmmla z12.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+      ".inst 0x6471e689  // bfmmla z9.s, z20.h, z17.h\n"
+      ".inst 0x6470e68d  // bfmmla z13.s, z20.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6470e68a  // bfmmla z10.s, z20.h, z16.h\n"
+      ".inst 0x6471e68e  // bfmmla z14.s, z20.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      ".inst 0x6471e68b  // bfmmla z11.s, z20.h, z17.h\n"
+      ".inst 0x6470e68f  // bfmmla z15.s, z20.h, z16.h\n"
       "add x26, x26, #0x10\n"
       "addvl x12, x12, #4\n"
       "addvl x11, x11, #4\n"
@@ -246,46 +246,46 @@
       "11:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x9]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
       "addvl x10, x10, #2\n"
       "addvl x9, x9, #2\n"
       "ble 12f\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6471e428  // bfmmla z8.s, z1.h, z17.h\n"
+      ".inst 0x6470e42c  // bfmmla z12.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6471e429  // bfmmla z9.s, z1.h, z17.h\n"
+      ".inst 0x6470e42d  // bfmmla z13.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e42a  // bfmmla z10.s, z1.h, z17.h\n"
+      ".inst 0x6470e42e  // bfmmla z14.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x9]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6471e42b  // bfmmla z11.s, z1.h, z17.h\n"
+      ".inst 0x6470e42f  // bfmmla z15.s, z1.h, z16.h\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
       "addvl x10, x10, #2\n"
@@ -301,17 +301,17 @@
       "uzp1 z11.d, z11.d, z15.d\n"
       "tbz %x[flags], #1, 13f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z21.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z21.s\n"
+      "fmin z9.s, p5/M, z9.s, z21.s\n"
+      "fmin z10.s, p5/M, z10.s, z21.s\n"
+      "fmin z11.s, p5/M, z11.s, z21.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "13:"  // Height 1: No activation
       "st1w { z8.s }, p4, [x13]\n"
       "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -376,21 +376,21 @@
       "18:"  // Height 2: no bias
       "tbz %x[flags], #0, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x13]\n"
-      "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "add x20, x13, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x13]\n"
+      "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 20f\n"
@@ -408,12 +408,12 @@
       "21:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 22f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -421,50 +421,50 @@
       "b 23f\n"
       "22:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "23:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "ble 25f\n"
       "24:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "ld1rqh { z20.h }, p0/Z, [x26]\n"
+      "ld1rqh { z19.h }, p0/Z, [x25]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x9]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+      ".inst 0x6471e688  // bfmmla z8.s, z20.h, z17.h\n"
+      ".inst 0x6470e68c  // bfmmla z12.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+      ".inst 0x6471e689  // bfmmla z9.s, z20.h, z17.h\n"
+      ".inst 0x6470e68d  // bfmmla z13.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e68a  // bfmmla z10.s, z20.h, z17.h\n"
+      ".inst 0x6470e68e  // bfmmla z14.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      ".inst 0x6471e68b  // bfmmla z11.s, z20.h, z17.h\n"
+      ".inst 0x6470e68f  // bfmmla z15.s, z20.h, z16.h\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "addvl x12, x12, #4\n"
@@ -475,47 +475,47 @@
       "25:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "ld1rqh { z19.h }, p0/Z, [x25]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x9]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
       "addvl x10, x10, #2\n"
       "addvl x9, x9, #2\n"
       "ble 26f\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6471e428  // bfmmla z8.s, z1.h, z17.h\n"
+      ".inst 0x6470e42c  // bfmmla z12.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x11]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6471e429  // bfmmla z9.s, z1.h, z17.h\n"
+      ".inst 0x6470e42d  // bfmmla z13.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e42a  // bfmmla z10.s, z1.h, z17.h\n"
+      ".inst 0x6470e42e  // bfmmla z14.s, z1.h, z16.h\n"
+      "ld1h { z22.h }, p5/Z, [x9]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6476e42b  // bfmmla z11.s, z1.h, z22.h\n"
+      ".inst 0x6470e42f  // bfmmla z15.s, z1.h, z16.h\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
       "addvl x10, x10, #2\n"
@@ -537,25 +537,25 @@
       "uzp2 z11.d, z11.d, z15.d\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z17.s\n"
+      "fmin z12.s, p5/M, z12.s, z17.s\n"
+      "fmin z13.s, p5/M, z13.s, z17.s\n"
+      "fmin z14.s, p5/M, z14.s, z17.s\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z7.s, p5/M, z7.s, z16.s\n"
+      "fmax z12.s, p5/M, z12.s, z16.s\n"
+      "fmax z13.s, p5/M, z13.s, z16.s\n"
+      "fmax z14.s, p5/M, z14.s, z16.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "27:"  // Height 2: No activation
       "st1w { z7.s }, p4, [x13]\n"
       "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -632,28 +632,28 @@
       "32:"  // Height 3: no bias
       "tbz %x[flags], #0, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x13]\n"
-      "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "add x21, x13, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x13]\n"
+      "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x20]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
@@ -685,13 +685,13 @@
       "35:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 36f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 37f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -700,145 +700,145 @@
       "b 37f\n"
       "36:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "37:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "ble 39f\n"
       "38:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "ld1rqh { z30.h }, p0/Z, [x26]\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
+      "ld1rqh { z28.h }, p0/Z, [x24]\n"
+      "trn1 z27.d, z30.d, z24.d\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "trn1 z26.d, z28.d, z29.d\n"
+      "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6479e768  // bfmmla z8.s, z27.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e76c  // bfmmla z12.s, z27.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x11]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6479e769  // bfmmla z9.s, z27.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "trn2 z28.d, z28.d, z29.d\n"
+      ".inst 0x6478e76d  // bfmmla z13.s, z27.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      ".inst 0x6479e76a  // bfmmla z10.s, z27.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6478e76e  // bfmmla z14.s, z27.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
+      ".inst 0x6479e76b  // bfmmla z11.s, z27.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
+      ".inst 0x6478e76f  // bfmmla z15.s, z27.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
+      ".inst 0x6479e7c8  // bfmmla z8.s, z30.h, z25.h\n"
+      ".inst 0x6479e790  // bfmmla z16.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
+      ".inst 0x6478e7cc  // bfmmla z12.s, z30.h, z24.h\n"
+      ".inst 0x6478e794  // bfmmla z20.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e7c9  // bfmmla z9.s, z30.h, z25.h\n"
+      ".inst 0x6479e791  // bfmmla z17.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6478e7cd  // bfmmla z13.s, z30.h, z24.h\n"
+      ".inst 0x6478e795  // bfmmla z21.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x6479e7ca  // bfmmla z10.s, z30.h, z25.h\n"
+      ".inst 0x6479e792  // bfmmla z18.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
+      ".inst 0x6478e7ce  // bfmmla z14.s, z30.h, z24.h\n"
+      ".inst 0x6478e796  // bfmmla z22.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e7cb  // bfmmla z11.s, z30.h, z25.h\n"
+      ".inst 0x6479e793  // bfmmla z19.s, z28.h, z25.h\n"
+      ".inst 0x6478e7cf  // bfmmla z15.s, z30.h, z24.h\n"
+      ".inst 0x6478e797  // bfmmla z23.s, z28.h, z24.h\n"
       "bgt 38b\n"
       "39:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "trn1 z27.d, z1.d, z24.d\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "trn1 z26.d, z3.d, z28.d\n"
+      "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6479e768  // bfmmla z8.s, z27.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e76c  // bfmmla z12.s, z27.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x11]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6479e769  // bfmmla z9.s, z27.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      ".inst 0x6478e76d  // bfmmla z13.s, z27.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "trn2 z3.d, z3.d, z28.d\n"
+      ".inst 0x6479e76a  // bfmmla z10.s, z27.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6478e76e  // bfmmla z14.s, z27.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
+      ".inst 0x6479e76b  // bfmmla z11.s, z27.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
       "addvl x10, x10, #2\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
+      ".inst 0x6478e76f  // bfmmla z15.s, z27.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
       "ble 40f\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6479e428  // bfmmla z8.s, z1.h, z25.h\n"
+      ".inst 0x6479e470  // bfmmla z16.s, z3.h, z25.h\n"
+      ".inst 0x6478e42c  // bfmmla z12.s, z1.h, z24.h\n"
+      ".inst 0x6478e474  // bfmmla z20.s, z3.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x11]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6479e429  // bfmmla z9.s, z1.h, z25.h\n"
+      ".inst 0x6479e471  // bfmmla z17.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6478e42d  // bfmmla z13.s, z1.h, z24.h\n"
+      ".inst 0x6478e475  // bfmmla z21.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      ".inst 0x6479e42a  // bfmmla z10.s, z1.h, z25.h\n"
+      ".inst 0x6479e472  // bfmmla z18.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6478e42e  // bfmmla z14.s, z1.h, z24.h\n"
+      ".inst 0x6478e476  // bfmmla z22.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e42b  // bfmmla z11.s, z1.h, z25.h\n"
+      ".inst 0x6479e473  // bfmmla z19.s, z3.h, z25.h\n"
+      ".inst 0x6478e42f  // bfmmla z15.s, z1.h, z24.h\n"
+      ".inst 0x6478e477  // bfmmla z23.s, z3.h, z24.h\n"
       "40:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -861,33 +861,33 @@
       "uzp1 z19.d, z19.d, z23.d\n"
       "tbz %x[flags], #1, 41f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z25.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z25.s\n"
+      "fmin z12.s, p5/M, z12.s, z25.s\n"
+      "fmin z13.s, p5/M, z13.s, z25.s\n"
+      "fmin z14.s, p5/M, z14.s, z25.s\n"
+      "fmin z8.s, p5/M, z8.s, z25.s\n"
+      "fmin z9.s, p5/M, z9.s, z25.s\n"
+      "fmin z10.s, p5/M, z10.s, z25.s\n"
+      "fmin z11.s, p5/M, z11.s, z25.s\n"
+      "fmin z16.s, p5/M, z16.s, z25.s\n"
+      "fmin z17.s, p5/M, z17.s, z25.s\n"
+      "fmin z18.s, p5/M, z18.s, z25.s\n"
+      "fmin z19.s, p5/M, z19.s, z25.s\n"
+      "fmax z7.s, p5/M, z7.s, z24.s\n"
+      "fmax z12.s, p5/M, z12.s, z24.s\n"
+      "fmax z13.s, p5/M, z13.s, z24.s\n"
+      "fmax z14.s, p5/M, z14.s, z24.s\n"
+      "fmax z8.s, p5/M, z8.s, z24.s\n"
+      "fmax z9.s, p5/M, z9.s, z24.s\n"
+      "fmax z10.s, p5/M, z10.s, z24.s\n"
+      "fmax z11.s, p5/M, z11.s, z24.s\n"
+      "fmax z16.s, p5/M, z16.s, z24.s\n"
+      "fmax z17.s, p5/M, z17.s, z24.s\n"
+      "fmax z18.s, p5/M, z18.s, z24.s\n"
+      "fmax z19.s, p5/M, z19.s, z24.s\n"
       "41:"  // Height 3: No activation
       "st1w { z7.s }, p4, [x13]\n"
       "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -968,37 +968,37 @@
       "46:"  // Height 4: no bias
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x13]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "add x22, x13, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x13]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x21]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
@@ -1026,14 +1026,14 @@
       "49:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1043,149 +1043,149 @@
       "b 51f\n"
       "50:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "51:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "ble 53f\n"
       "52:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "ld1rqh { z30.h }, p0/Z, [x26]\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
+      "trn1 z29.d, z30.d, z24.d\n"
+      "ld1rqh { z28.h }, p0/Z, [x24]\n"
+      "ld1rqh { z27.h }, p0/Z, [x23]\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "trn1 z26.d, z28.d, z27.d\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6479e7a8  // bfmmla z8.s, z29.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e7ac  // bfmmla z12.s, z29.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x11]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6479e7a9  // bfmmla z9.s, z29.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "trn2 z28.d, z28.d, z27.d\n"
+      ".inst 0x6478e7ad  // bfmmla z13.s, z29.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      ".inst 0x6479e7aa  // bfmmla z10.s, z29.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6478e7ae  // bfmmla z14.s, z29.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
+      ".inst 0x6479e7ab  // bfmmla z11.s, z29.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
+      ".inst 0x6478e7af  // bfmmla z15.s, z29.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
+      ".inst 0x6479e7c8  // bfmmla z8.s, z30.h, z25.h\n"
+      ".inst 0x6479e790  // bfmmla z16.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
+      ".inst 0x6478e7cc  // bfmmla z12.s, z30.h, z24.h\n"
+      ".inst 0x6478e794  // bfmmla z20.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6479e7c9  // bfmmla z9.s, z30.h, z25.h\n"
+      ".inst 0x6479e791  // bfmmla z17.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6478e7cd  // bfmmla z13.s, z30.h, z24.h\n"
+      ".inst 0x6478e795  // bfmmla z21.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x6479e7ca  // bfmmla z10.s, z30.h, z25.h\n"
+      ".inst 0x6479e792  // bfmmla z18.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n"
+      ".inst 0x6478e7ce  // bfmmla z14.s, z30.h, z24.h\n"
+      ".inst 0x6478e796  // bfmmla z22.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e7cb  // bfmmla z11.s, z30.h, z25.h\n"
+      ".inst 0x6479e793  // bfmmla z19.s, z28.h, z25.h\n"
+      ".inst 0x6478e7cf  // bfmmla z15.s, z30.h, z24.h\n"
+      ".inst 0x6478e797  // bfmmla z23.s, z28.h, z24.h\n"
       "bgt 52b\n"
       "53:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
+      "trn1 z28.d, z1.d, z24.d\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1rqh { z27.h }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "trn1 z26.d, z3.d, z27.d\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6479e788  // bfmmla z8.s, z28.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e78c  // bfmmla z12.s, z28.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x11]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6479e789  // bfmmla z9.s, z28.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      ".inst 0x6478e78d  // bfmmla z13.s, z28.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "trn2 z3.d, z3.d, z27.d\n"
+      ".inst 0x6479e78a  // bfmmla z10.s, z28.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6478e78e  // bfmmla z14.s, z28.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
+      ".inst 0x6479e78b  // bfmmla z11.s, z28.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
       "addvl x10, x10, #2\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
+      ".inst 0x6478e78f  // bfmmla z15.s, z28.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
       "ble 54f\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6479e428  // bfmmla z8.s, z1.h, z25.h\n"
+      ".inst 0x6479e470  // bfmmla z16.s, z3.h, z25.h\n"
+      ".inst 0x6478e42c  // bfmmla z12.s, z1.h, z24.h\n"
+      ".inst 0x6478e474  // bfmmla z20.s, z3.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x11]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6479e429  // bfmmla z9.s, z1.h, z25.h\n"
+      ".inst 0x6479e471  // bfmmla z17.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6478e42d  // bfmmla z13.s, z1.h, z24.h\n"
+      ".inst 0x6478e475  // bfmmla z21.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      ".inst 0x6479e42a  // bfmmla z10.s, z1.h, z25.h\n"
+      ".inst 0x6479e472  // bfmmla z18.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6478e42e  // bfmmla z14.s, z1.h, z24.h\n"
+      ".inst 0x6478e476  // bfmmla z22.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e42b  // bfmmla z11.s, z1.h, z25.h\n"
+      ".inst 0x6479e473  // bfmmla z19.s, z3.h, z25.h\n"
+      ".inst 0x6478e42f  // bfmmla z15.s, z1.h, z24.h\n"
+      ".inst 0x6478e477  // bfmmla z23.s, z3.h, z24.h\n"
       "54:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1213,41 +1213,41 @@
       "uzp2 z19.d, z19.d, z23.d\n"
       "tbz %x[flags], #1, 55f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z23.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z24.s\n"
+      "fmin z12.s, p5/M, z12.s, z24.s\n"
+      "fmin z13.s, p5/M, z13.s, z24.s\n"
+      "fmin z14.s, p5/M, z14.s, z24.s\n"
+      "fmin z8.s, p5/M, z8.s, z24.s\n"
+      "fmin z9.s, p5/M, z9.s, z24.s\n"
+      "fmin z10.s, p5/M, z10.s, z24.s\n"
+      "fmin z11.s, p5/M, z11.s, z24.s\n"
+      "fmin z15.s, p5/M, z15.s, z24.s\n"
+      "fmin z20.s, p5/M, z20.s, z24.s\n"
+      "fmin z21.s, p5/M, z21.s, z24.s\n"
+      "fmin z22.s, p5/M, z22.s, z24.s\n"
+      "fmin z16.s, p5/M, z16.s, z24.s\n"
+      "fmin z17.s, p5/M, z17.s, z24.s\n"
+      "fmin z18.s, p5/M, z18.s, z24.s\n"
+      "fmin z19.s, p5/M, z19.s, z24.s\n"
+      "fmax z7.s, p5/M, z7.s, z23.s\n"
+      "fmax z12.s, p5/M, z12.s, z23.s\n"
+      "fmax z13.s, p5/M, z13.s, z23.s\n"
+      "fmax z14.s, p5/M, z14.s, z23.s\n"
+      "fmax z8.s, p5/M, z8.s, z23.s\n"
+      "fmax z9.s, p5/M, z9.s, z23.s\n"
+      "fmax z10.s, p5/M, z10.s, z23.s\n"
+      "fmax z11.s, p5/M, z11.s, z23.s\n"
+      "fmax z15.s, p5/M, z15.s, z23.s\n"
+      "fmax z20.s, p5/M, z20.s, z23.s\n"
+      "fmax z21.s, p5/M, z21.s, z23.s\n"
+      "fmax z22.s, p5/M, z22.s, z23.s\n"
+      "fmax z16.s, p5/M, z16.s, z23.s\n"
+      "fmax z17.s, p5/M, z17.s, z23.s\n"
+      "fmax z18.s, p5/M, z18.s, z23.s\n"
+      "fmax z19.s, p5/M, z19.s, z23.s\n"
       "55:"  // Height 4: No activation
       "st1w { z7.s }, p4, [x13]\n"
       "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -1340,54 +1340,54 @@
       "60:"  // Height 5: no bias
       "tbz %x[flags], #0, 61f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x13]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x13, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x13]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x22]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x22]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x20]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z19.d, z24.d, z23.d\n"
       "zip2 z23.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z24.d, z25.d, z28.d\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 62f\n"
       "61:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1419,15 +1419,15 @@
       "63:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 64f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 65f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1438,189 +1438,189 @@
       "b 65f\n"
       "64:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "65:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "ble 67f\n"
       "66:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
+      "ld1rqh { z6.h }, p0/Z, [x26]\n"
+      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z7.h }, p0/Z, [x24]\n"
+      "ld1rqh { z2.h }, p0/Z, [x23]\n"
+      "trn1 z5.d, z6.d, z1.d\n"
+      "trn2 z6.d, z6.d, z1.d\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "trn1 z3.d, z7.d, z2.d\n"
+      "trn2 z7.d, z7.d, z2.d\n"
+      "ld1h { z1.h }, p5/Z, [x12]\n"
+      "trn1 z2.d, z4.d, z0.d\n"
+      "trn2 z4.d, z4.d, z0.d\n"
+      "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6461e4a8  // bfmmla z8.s, z5.h, z1.h\n"
+      ".inst 0x6461e470  // bfmmla z16.s, z3.h, z1.h\n"
+      ".inst 0x6461e458  // bfmmla z24.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x11]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
+      ".inst 0x6460e4ac  // bfmmla z12.s, z5.h, z0.h\n"
+      ".inst 0x6460e474  // bfmmla z20.s, z3.h, z0.h\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6460e45c  // bfmmla z28.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6461e4a9  // bfmmla z9.s, z5.h, z1.h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
+      ".inst 0x6461e471  // bfmmla z17.s, z3.h, z1.h\n"
+      ".inst 0x6461e459  // bfmmla z25.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
+      ".inst 0x6460e4ad  // bfmmla z13.s, z5.h, z0.h\n"
+      ".inst 0x6460e475  // bfmmla z21.s, z3.h, z0.h\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
+      ".inst 0x6460e45d  // bfmmla z29.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6461e4aa  // bfmmla z10.s, z5.h, z1.h\n"
+      ".inst 0x6461e472  // bfmmla z18.s, z3.h, z1.h\n"
+      ".inst 0x6461e45a  // bfmmla z26.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x9]\n"
+      ".inst 0x6460e4ae  // bfmmla z14.s, z5.h, z0.h\n"
+      ".inst 0x6460e476  // bfmmla z22.s, z3.h, z0.h\n"
+      ".inst 0x6460e45e  // bfmmla z30.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6461e4ab  // bfmmla z11.s, z5.h, z1.h\n"
+      ".inst 0x6461e473  // bfmmla z19.s, z3.h, z1.h\n"
+      ".inst 0x6461e45b  // bfmmla z27.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+      ".inst 0x6460e4af  // bfmmla z15.s, z5.h, z0.h\n"
+      ".inst 0x6460e477  // bfmmla z23.s, z3.h, z0.h\n"
+      ".inst 0x6460e45f  // bfmmla z31.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+      ".inst 0x6461e4c8  // bfmmla z8.s, z6.h, z1.h\n"
       "addvl x12, x12, #4\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
+      ".inst 0x6461e4f0  // bfmmla z16.s, z7.h, z1.h\n"
+      ".inst 0x6461e498  // bfmmla z24.s, z4.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
+      ".inst 0x6460e4cc  // bfmmla z12.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f4  // bfmmla z20.s, z7.h, z0.h\n"
+      ".inst 0x6460e49c  // bfmmla z28.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+      ".inst 0x6461e4c9  // bfmmla z9.s, z6.h, z1.h\n"
       "addvl x11, x11, #4\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
+      ".inst 0x6461e4f1  // bfmmla z17.s, z7.h, z1.h\n"
+      ".inst 0x6461e499  // bfmmla z25.s, z4.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e4cd  // bfmmla z13.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f5  // bfmmla z21.s, z7.h, z0.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4ca  // bfmmla z10.s, z6.h, z1.h\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
+      ".inst 0x6461e4f2  // bfmmla z18.s, z7.h, z1.h\n"
+      ".inst 0x6461e49a  // bfmmla z26.s, z4.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
+      ".inst 0x6460e4ce  // bfmmla z14.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f6  // bfmmla z22.s, z7.h, z0.h\n"
+      ".inst 0x6460e49e  // bfmmla z30.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x6461e4cb  // bfmmla z11.s, z6.h, z1.h\n"
       "addvl x9, x9, #4\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6461e4f3  // bfmmla z19.s, z7.h, z1.h\n"
+      ".inst 0x6461e49b  // bfmmla z27.s, z4.h, z1.h\n"
+      ".inst 0x6460e4cf  // bfmmla z15.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f7  // bfmmla z23.s, z7.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "bgt 66b\n"
       "67:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
+      "ld1rqh { z4.h }, p0/Z, [x25]\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
+      "ld1rqh { z2.h }, p0/Z, [x23]\n"
+      "trn1 z7.d, z1.d, z4.d\n"
+      "trn2 z1.d, z1.d, z4.d\n"
       "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
+      "trn1 z6.d, z3.d, z2.d\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "ld1h { z2.h }, p5/Z, [x12]\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6462e4e8  // bfmmla z8.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d0  // bfmmla z16.s, z6.h, z2.h\n"
+      ".inst 0x6462e498  // bfmmla z24.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x11]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d4  // bfmmla z20.s, z6.h, z0.h\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6460e49c  // bfmmla z28.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6462e4e9  // bfmmla z9.s, z7.h, z2.h\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
+      ".inst 0x6462e4d1  // bfmmla z17.s, z6.h, z2.h\n"
+      ".inst 0x6462e499  // bfmmla z25.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d5  // bfmmla z21.s, z6.h, z0.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e4ea  // bfmmla z10.s, z7.h, z2.h\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
+      ".inst 0x6462e4d2  // bfmmla z18.s, z6.h, z2.h\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x9]\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d6  // bfmmla z22.s, z6.h, z0.h\n"
+      ".inst 0x6460e49e  // bfmmla z30.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6462e4eb  // bfmmla z11.s, z7.h, z2.h\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
+      ".inst 0x6462e4d3  // bfmmla z19.s, z6.h, z2.h\n"
+      ".inst 0x6462e49b  // bfmmla z27.s, z4.h, z2.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d7  // bfmmla z23.s, z6.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "ble 68f\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
+      "ld1h { z2.h }, p5/Z, [x12]\n"
+      "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6462e428  // bfmmla z8.s, z1.h, z2.h\n"
+      ".inst 0x6462e470  // bfmmla z16.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b8  // bfmmla z24.s, z5.h, z2.h\n"
+      ".inst 0x6460e42c  // bfmmla z12.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x11]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6460e474  // bfmmla z20.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bc  // bfmmla z28.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e429  // bfmmla z9.s, z1.h, z2.h\n"
+      ".inst 0x6462e471  // bfmmla z17.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b9  // bfmmla z25.s, z5.h, z2.h\n"
+      ".inst 0x6460e42d  // bfmmla z13.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      ".inst 0x6460e475  // bfmmla z21.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6462e42a  // bfmmla z10.s, z1.h, z2.h\n"
+      ".inst 0x6462e472  // bfmmla z18.s, z3.h, z2.h\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6460e42e  // bfmmla z14.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x9]\n"
+      ".inst 0x6460e476  // bfmmla z22.s, z3.h, z0.h\n"
+      ".inst 0x6460e4be  // bfmmla z30.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6462e42b  // bfmmla z11.s, z1.h, z2.h\n"
+      ".inst 0x6462e473  // bfmmla z19.s, z3.h, z2.h\n"
+      ".inst 0x6462e4bb  // bfmmla z27.s, z5.h, z2.h\n"
+      ".inst 0x6460e42f  // bfmmla z15.s, z1.h, z0.h\n"
+      ".inst 0x6460e477  // bfmmla z23.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "68:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1653,49 +1653,49 @@
       "uzp1 z27.d, z27.d, z31.d\n"
       "tbz %x[flags], #1, 69f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
       "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z24.s, p5/M, z24.s, z1.s\n"
-      "fmin z25.s, p5/M, z25.s, z1.s\n"
-      "fmin z26.s, p5/M, z26.s, z1.s\n"
-      "fmin z27.s, p5/M, z27.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z24.s, p5/M, z24.s, z0.s\n"
-      "fmax z25.s, p5/M, z25.s, z0.s\n"
-      "fmax z26.s, p5/M, z26.s, z0.s\n"
-      "fmax z27.s, p5/M, z27.s, z0.s\n"
+      "add x20, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z23.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmax z7.s, p5/M, z7.s, z23.s\n"
+      "fmax z12.s, p5/M, z12.s, z23.s\n"
+      "fmax z13.s, p5/M, z13.s, z23.s\n"
+      "fmax z14.s, p5/M, z14.s, z23.s\n"
+      "fmax z8.s, p5/M, z8.s, z23.s\n"
+      "fmax z9.s, p5/M, z9.s, z23.s\n"
+      "fmax z10.s, p5/M, z10.s, z23.s\n"
+      "fmax z11.s, p5/M, z11.s, z23.s\n"
+      "fmax z15.s, p5/M, z15.s, z23.s\n"
+      "fmax z20.s, p5/M, z20.s, z23.s\n"
+      "fmax z21.s, p5/M, z21.s, z23.s\n"
+      "fmax z22.s, p5/M, z22.s, z23.s\n"
+      "fmax z16.s, p5/M, z16.s, z23.s\n"
+      "fmax z17.s, p5/M, z17.s, z23.s\n"
+      "fmax z18.s, p5/M, z18.s, z23.s\n"
+      "fmax z19.s, p5/M, z19.s, z23.s\n"
+      "fmax z24.s, p5/M, z24.s, z23.s\n"
+      "fmax z25.s, p5/M, z25.s, z23.s\n"
+      "fmax z26.s, p5/M, z26.s, z23.s\n"
+      "fmax z27.s, p5/M, z27.s, z23.s\n"
       "69:"  // Height 5: No activation
       "st1w { z7.s }, p4, [x13]\n"
       "st1w { z12.s }, p3, [x13, #1, MUL VL]\n"
@@ -1795,59 +1795,59 @@
       "74:"  // Height 6: no bias
       "tbz %x[flags], #0, 75f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x13]\n"
+      "add x24, x13, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z17.s }, p4/Z, [x13]\n"
       "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n"
       "add x21, x22, x20, LSL #2\n"
+      "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x24]\n"
+      "zip1 z8.d, z17.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "zip2 z12.d, z17.d, z12.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x23]\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z20.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip2 z14.d, z20.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x22]\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "zip1 z16.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x22]\n"
+      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x21]\n"
       "zip2 z21.d, z18.d, z21.d\n"
       "zip1 z18.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20]\n"
       "zip2 z23.d, z24.d, z23.d\n"
       "zip1 z24.d, z25.d, z28.d\n"
-      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
-      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 76f\n"
       "75:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1879,16 +1879,16 @@
       "77:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 78f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 79f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1900,193 +1900,193 @@
       "b 79f\n"
       "78:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "79:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "ble 81f\n"
       "80:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "ld1rqh { z6.h }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
+      "ld1rqh { z7.h }, p0/Z, [x26]\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
+      "trn1 z6.d, z7.d, z0.d\n"
+      "ld1rqh { z5.h }, p0/Z, [x24]\n"
+      "ld1rqh { z1.h }, p0/Z, [x23]\n"
+      "trn2 z7.d, z7.d, z0.d\n"
+      "trn1 z4.d, z5.d, z1.d\n"
+      "ld1rqh { z3.h }, p0/Z, [x22]\n"
+      "ld1rqh { z0.h }, p0/Z, [x21]\n"
+      "trn2 z5.d, z5.d, z1.d\n"
+      "trn1 z2.d, z3.d, z0.d\n"
+      "trn2 z3.d, z3.d, z0.d\n"
+      "ld1h { z1.h }, p5/Z, [x12]\n"
+      "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6461e4c8  // bfmmla z8.s, z6.h, z1.h\n"
+      ".inst 0x6461e490  // bfmmla z16.s, z4.h, z1.h\n"
+      ".inst 0x6461e458  // bfmmla z24.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x11]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
+      ".inst 0x6460e4cc  // bfmmla z12.s, z6.h, z0.h\n"
+      ".inst 0x6460e494  // bfmmla z20.s, z4.h, z0.h\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6460e45c  // bfmmla z28.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6461e4c9  // bfmmla z9.s, z6.h, z1.h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
+      ".inst 0x6461e491  // bfmmla z17.s, z4.h, z1.h\n"
+      ".inst 0x6461e459  // bfmmla z25.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
+      ".inst 0x6460e4cd  // bfmmla z13.s, z6.h, z0.h\n"
+      ".inst 0x6460e495  // bfmmla z21.s, z4.h, z0.h\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
+      ".inst 0x6460e45d  // bfmmla z29.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6461e4ca  // bfmmla z10.s, z6.h, z1.h\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
+      ".inst 0x6461e492  // bfmmla z18.s, z4.h, z1.h\n"
+      ".inst 0x6461e45a  // bfmmla z26.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x9]\n"
+      ".inst 0x6460e4ce  // bfmmla z14.s, z6.h, z0.h\n"
+      ".inst 0x6460e496  // bfmmla z22.s, z4.h, z0.h\n"
+      ".inst 0x6460e45e  // bfmmla z30.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6461e4cb  // bfmmla z11.s, z6.h, z1.h\n"
+      ".inst 0x6461e493  // bfmmla z19.s, z4.h, z1.h\n"
+      ".inst 0x6461e45b  // bfmmla z27.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+      ".inst 0x6460e4cf  // bfmmla z15.s, z6.h, z0.h\n"
+      ".inst 0x6460e497  // bfmmla z23.s, z4.h, z0.h\n"
+      ".inst 0x6460e45f  // bfmmla z31.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n"
+      ".inst 0x6461e4e8  // bfmmla z8.s, z7.h, z1.h\n"
       "addvl x12, x12, #4\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
+      ".inst 0x6461e4b0  // bfmmla z16.s, z5.h, z1.h\n"
+      ".inst 0x6461e478  // bfmmla z24.s, z3.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b4  // bfmmla z20.s, z5.h, z0.h\n"
+      ".inst 0x6460e47c  // bfmmla z28.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
       "addvl x11, x11, #4\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
+      ".inst 0x6461e4b1  // bfmmla z17.s, z5.h, z1.h\n"
+      ".inst 0x6461e479  // bfmmla z25.s, z3.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b5  // bfmmla z21.s, z5.h, z0.h\n"
+      ".inst 0x6460e47d  // bfmmla z29.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4ea  // bfmmla z10.s, z7.h, z1.h\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
+      ".inst 0x6461e4b2  // bfmmla z18.s, z5.h, z1.h\n"
+      ".inst 0x6461e47a  // bfmmla z26.s, z3.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b6  // bfmmla z22.s, z5.h, z0.h\n"
+      ".inst 0x6460e47e  // bfmmla z30.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
       "addvl x9, x9, #4\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6461e4b3  // bfmmla z19.s, z5.h, z1.h\n"
+      ".inst 0x6461e47b  // bfmmla z27.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b7  // bfmmla z23.s, z5.h, z0.h\n"
+      ".inst 0x6460e47f  // bfmmla z31.s, z3.h, z0.h\n"
       "bgt 80b\n"
       "81:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
+      "trn1 z7.d, z1.d, z0.d\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
+      "ld1rqh { z2.h }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z0.d\n"
+      "trn1 z6.d, z3.d, z2.d\n"
       "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "ld1rqh { z6.h }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
+      "ld1rqh { z0.h }, p0/Z, [x21]\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1h { z2.h }, p5/Z, [x12]\n"
+      "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6462e4e8  // bfmmla z8.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d0  // bfmmla z16.s, z6.h, z2.h\n"
+      ".inst 0x6462e498  // bfmmla z24.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x11]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d4  // bfmmla z20.s, z6.h, z0.h\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6460e49c  // bfmmla z28.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6462e4e9  // bfmmla z9.s, z7.h, z2.h\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
+      ".inst 0x6462e4d1  // bfmmla z17.s, z6.h, z2.h\n"
+      ".inst 0x6462e499  // bfmmla z25.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d5  // bfmmla z21.s, z6.h, z0.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e4ea  // bfmmla z10.s, z7.h, z2.h\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
+      ".inst 0x6462e4d2  // bfmmla z18.s, z6.h, z2.h\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x9]\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d6  // bfmmla z22.s, z6.h, z0.h\n"
+      ".inst 0x6460e49e  // bfmmla z30.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6462e4eb  // bfmmla z11.s, z7.h, z2.h\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
+      ".inst 0x6462e4d3  // bfmmla z19.s, z6.h, z2.h\n"
+      ".inst 0x6462e49b  // bfmmla z27.s, z4.h, z2.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d7  // bfmmla z23.s, z6.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "ble 82f\n"
-      "ld1h { z7.h }, p5/Z, [x12]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
+      "ld1h { z2.h }, p5/Z, [x12]\n"
+      "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6462e428  // bfmmla z8.s, z1.h, z2.h\n"
+      ".inst 0x6462e470  // bfmmla z16.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b8  // bfmmla z24.s, z5.h, z2.h\n"
+      ".inst 0x6460e42c  // bfmmla z12.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x11]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6460e474  // bfmmla z20.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bc  // bfmmla z28.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e429  // bfmmla z9.s, z1.h, z2.h\n"
+      ".inst 0x6462e471  // bfmmla z17.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b9  // bfmmla z25.s, z5.h, z2.h\n"
+      ".inst 0x6460e42d  // bfmmla z13.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      ".inst 0x6460e475  // bfmmla z21.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6462e42a  // bfmmla z10.s, z1.h, z2.h\n"
+      ".inst 0x6462e472  // bfmmla z18.s, z3.h, z2.h\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6460e42e  // bfmmla z14.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x9]\n"
+      ".inst 0x6460e476  // bfmmla z22.s, z3.h, z0.h\n"
+      ".inst 0x6460e4be  // bfmmla z30.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6462e42b  // bfmmla z11.s, z1.h, z2.h\n"
+      ".inst 0x6462e473  // bfmmla z19.s, z3.h, z2.h\n"
+      ".inst 0x6462e4bb  // bfmmla z27.s, z5.h, z2.h\n"
+      ".inst 0x6460e42f  // bfmmla z15.s, z1.h, z0.h\n"
+      ".inst 0x6460e477  // bfmmla z23.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "82:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
index acbc619..c42ad7e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
index 5f093bf..66601bd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -163,11 +163,11 @@
       "7:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 8f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 9f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -183,12 +183,12 @@
       "10:"  // Height 1: Multiply loop: Main loop
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z17.h }, p4/Z, [x10]\n"
+      "ld1h { z16.h }, p4/Z, [x9]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
       "add x26, x26, #0x2\n"
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -201,12 +201,12 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z17.h }, p4/Z, [x10]\n"
+      "ld1h { z16.h }, p4/Z, [x9]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
@@ -214,17 +214,17 @@
       "bne 7b\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z17.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
+      "ld1rh { z16.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z17.h\n"
+      "fmin z9.h, p4/M, z9.h, z17.h\n"
+      "fmin z10.h, p4/M, z10.h, z17.h\n"
+      "fmin z11.h, p4/M, z11.h, z17.h\n"
+      "fmax z8.h, p4/M, z8.h, z16.h\n"
+      "fmax z9.h, p4/M, z9.h, z16.h\n"
+      "fmax z10.h, p4/M, z10.h, z16.h\n"
+      "fmax z11.h, p4/M, z11.h, z16.h\n"
       "12:"  // Height 1: No activation
       "st1h { z8.h }, p3, [x13]\n"
       "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -285,15 +285,15 @@
       "17:"  // Height 2: no bias
       "tbz %x[flags], #0, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
+      "add x20, x13, x20, LSL #1\n"
       "ld1h { z8.h }, p3/Z, [x13]\n"
       "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x20]\n"
+      "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 19f\n"
       "18:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -309,12 +309,12 @@
       "20:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 21f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 22f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -322,7 +322,7 @@
       "b 22f\n"
       "21:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "22:"  // Height 2: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -333,19 +333,19 @@
       "23:"  // Height 2: Multiply loop: Main loop
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z12.h, p4/M, z6.h, z1.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z17.h }, p4/Z, [x10]\n"
       "addvl x12, x12, #1\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z16.h }, p4/Z, [x9]\n"
       "addvl x11, x11, #1\n"
       "add x26, x26, #0x2\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z14.h, p4/M, z17.h, z1.h\n"
       "add x25, x25, #0x2\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
+      "fmla z15.h, p4/M, z16.h, z1.h\n"
       "addvl x10, x10, #1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
@@ -357,18 +357,18 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z12.h, p4/M, z6.h, z1.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z17.h }, p4/Z, [x10]\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z16.h }, p4/Z, [x9]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z14.h, p4/M, z17.h, z1.h\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
+      "fmla z15.h, p4/M, z16.h, z1.h\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "bne 20b\n"
@@ -376,25 +376,25 @@
       "add x25, x13, x20, LSL #1\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z17.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
+      "ld1rh { z16.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z17.h\n"
+      "fmin z9.h, p4/M, z9.h, z17.h\n"
+      "fmin z10.h, p4/M, z10.h, z17.h\n"
+      "fmin z11.h, p4/M, z11.h, z17.h\n"
+      "fmin z12.h, p4/M, z12.h, z17.h\n"
+      "fmin z13.h, p4/M, z13.h, z17.h\n"
+      "fmin z14.h, p4/M, z14.h, z17.h\n"
+      "fmin z15.h, p4/M, z15.h, z17.h\n"
+      "fmax z8.h, p4/M, z8.h, z16.h\n"
+      "fmax z9.h, p4/M, z9.h, z16.h\n"
+      "fmax z10.h, p4/M, z10.h, z16.h\n"
+      "fmax z11.h, p4/M, z11.h, z16.h\n"
+      "fmax z12.h, p4/M, z12.h, z16.h\n"
+      "fmax z13.h, p4/M, z13.h, z16.h\n"
+      "fmax z14.h, p4/M, z14.h, z16.h\n"
+      "fmax z15.h, p4/M, z15.h, z16.h\n"
       "25:"  // Height 2: No activation
       "st1h { z8.h }, p3, [x13]\n"
       "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -463,20 +463,20 @@
       "30:"  // Height 3: no bias
       "tbz %x[flags], #0, 31f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x21, x13, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z8.h }, p3/Z, [x13]\n"
       "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x21]\n"
+      "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x20]\n"
+      "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 32f\n"
       "31:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -496,13 +496,13 @@
       "33:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 34f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 35f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -511,8 +511,8 @@
       "b 35f\n"
       "34:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "35:"  // Height 3: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -528,22 +528,22 @@
       "addvl x11, x11, #1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z21.h }, p4/Z, [x10]\n"
       "add x26, x26, #0x2\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z20.h }, p4/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
       "add x25, x25, #0x2\n"
       "add x24, x24, #0x2\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z21.h, z0.h\n"
+      "fmla z14.h, p4/M, z21.h, z1.h\n"
+      "fmla z18.h, p4/M, z21.h, z2.h\n"
+      "fmla z11.h, p4/M, z20.h, z0.h\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
+      "fmla z15.h, p4/M, z20.h, z1.h\n"
+      "fmla z19.h, p4/M, z20.h, z2.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
       "ld1rh { z2.h }, p4/Z, [x24]\n"
@@ -557,54 +557,54 @@
       "add x28, x28, #0x1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z21.h }, p4/Z, [x10]\n"
       "cmp x28, x20\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z20.h }, p4/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z21.h, z0.h\n"
+      "fmla z14.h, p4/M, z21.h, z1.h\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z18.h, p4/M, z21.h, z2.h\n"
+      "fmla z11.h, p4/M, z20.h, z0.h\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
+      "fmla z15.h, p4/M, z20.h, z1.h\n"
+      "fmla z19.h, p4/M, z20.h, z2.h\n"
       "bne 33b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x13, x20, LSL #1\n"
       "add x24, x25, x20, LSL #1\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z21.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmin z16.h, p4/M, z16.h, z1.h\n"
-      "fmin z17.h, p4/M, z17.h, z1.h\n"
-      "fmin z18.h, p4/M, z18.h, z1.h\n"
-      "fmin z19.h, p4/M, z19.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
-      "fmax z16.h, p4/M, z16.h, z0.h\n"
-      "fmax z17.h, p4/M, z17.h, z0.h\n"
-      "fmax z18.h, p4/M, z18.h, z0.h\n"
-      "fmax z19.h, p4/M, z19.h, z0.h\n"
+      "ld1rh { z20.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z21.h\n"
+      "fmin z9.h, p4/M, z9.h, z21.h\n"
+      "fmin z10.h, p4/M, z10.h, z21.h\n"
+      "fmin z11.h, p4/M, z11.h, z21.h\n"
+      "fmin z12.h, p4/M, z12.h, z21.h\n"
+      "fmin z13.h, p4/M, z13.h, z21.h\n"
+      "fmin z14.h, p4/M, z14.h, z21.h\n"
+      "fmin z15.h, p4/M, z15.h, z21.h\n"
+      "fmin z16.h, p4/M, z16.h, z21.h\n"
+      "fmin z17.h, p4/M, z17.h, z21.h\n"
+      "fmin z18.h, p4/M, z18.h, z21.h\n"
+      "fmin z19.h, p4/M, z19.h, z21.h\n"
+      "fmax z8.h, p4/M, z8.h, z20.h\n"
+      "fmax z9.h, p4/M, z9.h, z20.h\n"
+      "fmax z10.h, p4/M, z10.h, z20.h\n"
+      "fmax z11.h, p4/M, z11.h, z20.h\n"
+      "fmax z12.h, p4/M, z12.h, z20.h\n"
+      "fmax z13.h, p4/M, z13.h, z20.h\n"
+      "fmax z14.h, p4/M, z14.h, z20.h\n"
+      "fmax z15.h, p4/M, z15.h, z20.h\n"
+      "fmax z16.h, p4/M, z16.h, z20.h\n"
+      "fmax z17.h, p4/M, z17.h, z20.h\n"
+      "fmax z18.h, p4/M, z18.h, z20.h\n"
+      "fmax z19.h, p4/M, z19.h, z20.h\n"
       "38:"  // Height 3: No activation
       "st1h { z8.h }, p3, [x13]\n"
       "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -681,25 +681,25 @@
       "43:"  // Height 4: no bias
       "tbz %x[flags], #0, 44f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x22, x13, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z8.h }, p3/Z, [x13]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p3/Z, [x23]\n"
-      "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x22]\n"
+      "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x21]\n"
+      "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z20.h }, p3/Z, [x20]\n"
+      "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 45f\n"
       "44:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -723,14 +723,14 @@
       "46:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 47f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 48f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -740,9 +740,9 @@
       "b 48f\n"
       "47:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "48:"  // Height 4: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -759,7 +759,7 @@
       "addvl x11, x11, #1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z20.h, p4/M, z6.h, z3.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z25.h }, p4/Z, [x10]\n"
       "add x26, x26, #0x2\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
@@ -767,22 +767,22 @@
       "add x25, x25, #0x2\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z24.h }, p4/Z, [x9]\n"
       "add x24, x24, #0x2\n"
       "add x23, x23, #0x2\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z25.h, z0.h\n"
+      "fmla z14.h, p4/M, z25.h, z1.h\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
+      "fmla z18.h, p4/M, z25.h, z2.h\n"
+      "fmla z22.h, p4/M, z25.h, z3.h\n"
       "addvl x9, x9, #1\n"
       "ld1h { z6.h }, p4/Z, [x12]\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
+      "fmla z11.h, p4/M, z24.h, z0.h\n"
+      "fmla z15.h, p4/M, z24.h, z1.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
+      "fmla z19.h, p4/M, z24.h, z2.h\n"
+      "fmla z23.h, p4/M, z24.h, z3.h\n"
       "ld1rh { z2.h }, p4/Z, [x24]\n"
       "ld1rh { z3.h }, p4/Z, [x23]\n"
       "ld1h { z7.h }, p4/Z, [x11]\n"
@@ -794,7 +794,7 @@
       "add x28, x28, #0x1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z20.h, p4/M, z6.h, z3.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z25.h }, p4/Z, [x10]\n"
       "cmp x28, x20\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
@@ -802,17 +802,17 @@
       "addvl x11, x11, #1\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z24.h }, p4/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z25.h, z0.h\n"
+      "fmla z14.h, p4/M, z25.h, z1.h\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
+      "fmla z18.h, p4/M, z25.h, z2.h\n"
+      "fmla z22.h, p4/M, z25.h, z3.h\n"
+      "fmla z11.h, p4/M, z24.h, z0.h\n"
+      "fmla z15.h, p4/M, z24.h, z1.h\n"
+      "fmla z19.h, p4/M, z24.h, z2.h\n"
+      "fmla z23.h, p4/M, z24.h, z3.h\n"
       "bne 46b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x13, x20, LSL #1\n"
@@ -820,41 +820,41 @@
       "add x23, x24, x20, LSL #1\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z25.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmin z16.h, p4/M, z16.h, z1.h\n"
-      "fmin z17.h, p4/M, z17.h, z1.h\n"
-      "fmin z18.h, p4/M, z18.h, z1.h\n"
-      "fmin z19.h, p4/M, z19.h, z1.h\n"
-      "fmin z20.h, p4/M, z20.h, z1.h\n"
-      "fmin z21.h, p4/M, z21.h, z1.h\n"
-      "fmin z22.h, p4/M, z22.h, z1.h\n"
-      "fmin z23.h, p4/M, z23.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
-      "fmax z16.h, p4/M, z16.h, z0.h\n"
-      "fmax z17.h, p4/M, z17.h, z0.h\n"
-      "fmax z18.h, p4/M, z18.h, z0.h\n"
-      "fmax z19.h, p4/M, z19.h, z0.h\n"
-      "fmax z20.h, p4/M, z20.h, z0.h\n"
-      "fmax z21.h, p4/M, z21.h, z0.h\n"
-      "fmax z22.h, p4/M, z22.h, z0.h\n"
-      "fmax z23.h, p4/M, z23.h, z0.h\n"
+      "ld1rh { z24.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z25.h\n"
+      "fmin z9.h, p4/M, z9.h, z25.h\n"
+      "fmin z10.h, p4/M, z10.h, z25.h\n"
+      "fmin z11.h, p4/M, z11.h, z25.h\n"
+      "fmin z12.h, p4/M, z12.h, z25.h\n"
+      "fmin z13.h, p4/M, z13.h, z25.h\n"
+      "fmin z14.h, p4/M, z14.h, z25.h\n"
+      "fmin z15.h, p4/M, z15.h, z25.h\n"
+      "fmin z16.h, p4/M, z16.h, z25.h\n"
+      "fmin z17.h, p4/M, z17.h, z25.h\n"
+      "fmin z18.h, p4/M, z18.h, z25.h\n"
+      "fmin z19.h, p4/M, z19.h, z25.h\n"
+      "fmin z20.h, p4/M, z20.h, z25.h\n"
+      "fmin z21.h, p4/M, z21.h, z25.h\n"
+      "fmin z22.h, p4/M, z22.h, z25.h\n"
+      "fmin z23.h, p4/M, z23.h, z25.h\n"
+      "fmax z8.h, p4/M, z8.h, z24.h\n"
+      "fmax z9.h, p4/M, z9.h, z24.h\n"
+      "fmax z10.h, p4/M, z10.h, z24.h\n"
+      "fmax z11.h, p4/M, z11.h, z24.h\n"
+      "fmax z12.h, p4/M, z12.h, z24.h\n"
+      "fmax z13.h, p4/M, z13.h, z24.h\n"
+      "fmax z14.h, p4/M, z14.h, z24.h\n"
+      "fmax z15.h, p4/M, z15.h, z24.h\n"
+      "fmax z16.h, p4/M, z16.h, z24.h\n"
+      "fmax z17.h, p4/M, z17.h, z24.h\n"
+      "fmax z18.h, p4/M, z18.h, z24.h\n"
+      "fmax z19.h, p4/M, z19.h, z24.h\n"
+      "fmax z20.h, p4/M, z20.h, z24.h\n"
+      "fmax z21.h, p4/M, z21.h, z24.h\n"
+      "fmax z22.h, p4/M, z22.h, z24.h\n"
+      "fmax z23.h, p4/M, z23.h, z24.h\n"
       "51:"  // Height 4: No activation
       "st1h { z8.h }, p3, [x13]\n"
       "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -939,30 +939,30 @@
       "56:"  // Height 5: no bias
       "tbz %x[flags], #0, 57f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p3/Z, [x13]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x23, x13, x20, LSL #1\n"
       "add x22, x23, x20, LSL #1\n"
+      "ld1h { z8.h }, p3/Z, [x13]\n"
+      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p3/Z, [x23]\n"
-      "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p3/Z, [x22]\n"
-      "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x23]\n"
+      "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x22]\n"
+      "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z20.h }, p3/Z, [x21]\n"
+      "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z24.h }, p3/Z, [x20]\n"
+      "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 58f\n"
       "57:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -990,15 +990,15 @@
       "59:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 60f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 61f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1009,10 +1009,10 @@
       "b 61f\n"
       "60:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "61:"  // Height 5: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -1034,7 +1034,7 @@
       "subs x27, x27, #0x1\n"
       "fmla z24.h, p4/M, z6.h, z4.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z29.h }, p4/Z, [x10]\n"
       "add x25, x25, #0x2\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
@@ -1042,24 +1042,24 @@
       "add x23, x23, #0x2\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
       "fmla z25.h, p4/M, z7.h, z4.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z28.h }, p4/Z, [x9]\n"
       "add x22, x22, #0x2\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z29.h, z0.h\n"
+      "fmla z14.h, p4/M, z29.h, z1.h\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
-      "fmla z26.h, p4/M, z6.h, z4.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z18.h, p4/M, z29.h, z2.h\n"
+      "fmla z22.h, p4/M, z29.h, z3.h\n"
+      "fmla z26.h, p4/M, z29.h, z4.h\n"
+      "fmla z11.h, p4/M, z28.h, z0.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1h { z6.h }, p4/Z, [x12]\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
+      "fmla z15.h, p4/M, z28.h, z1.h\n"
+      "fmla z19.h, p4/M, z28.h, z2.h\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
       "ld1rh { z2.h }, p4/Z, [x24]\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
-      "fmla z27.h, p4/M, z7.h, z4.h\n"
+      "fmla z23.h, p4/M, z28.h, z3.h\n"
+      "fmla z27.h, p4/M, z28.h, z4.h\n"
       "ld1rh { z3.h }, p4/Z, [x23]\n"
       "ld1rh { z4.h }, p4/Z, [x22]\n"
       "ld1h { z7.h }, p4/Z, [x11]\n"
@@ -1075,25 +1075,25 @@
       "addvl x12, x12, #1\n"
       "fmla z24.h, p4/M, z6.h, z4.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10]\n"
+      "ld1h { z29.h }, p4/Z, [x10]\n"
       "addvl x11, x11, #1\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "addvl x10, x10, #1\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
       "fmla z25.h, p4/M, z7.h, z4.h\n"
-      "ld1h { z7.h }, p4/Z, [x9]\n"
+      "ld1h { z28.h }, p4/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
-      "fmla z26.h, p4/M, z6.h, z4.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
-      "fmla z27.h, p4/M, z7.h, z4.h\n"
+      "fmla z10.h, p4/M, z29.h, z0.h\n"
+      "fmla z14.h, p4/M, z29.h, z1.h\n"
+      "fmla z18.h, p4/M, z29.h, z2.h\n"
+      "fmla z22.h, p4/M, z29.h, z3.h\n"
+      "fmla z26.h, p4/M, z29.h, z4.h\n"
+      "fmla z11.h, p4/M, z28.h, z0.h\n"
+      "fmla z15.h, p4/M, z28.h, z1.h\n"
+      "fmla z19.h, p4/M, z28.h, z2.h\n"
+      "fmla z23.h, p4/M, z28.h, z3.h\n"
+      "fmla z27.h, p4/M, z28.h, z4.h\n"
       "bne 59b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x13, x20, LSL #1\n"
@@ -1102,49 +1102,49 @@
       "add x22, x23, x20, LSL #1\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z29.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmin z16.h, p4/M, z16.h, z1.h\n"
-      "fmin z17.h, p4/M, z17.h, z1.h\n"
-      "fmin z18.h, p4/M, z18.h, z1.h\n"
-      "fmin z19.h, p4/M, z19.h, z1.h\n"
-      "fmin z20.h, p4/M, z20.h, z1.h\n"
-      "fmin z21.h, p4/M, z21.h, z1.h\n"
-      "fmin z22.h, p4/M, z22.h, z1.h\n"
-      "fmin z23.h, p4/M, z23.h, z1.h\n"
-      "fmin z24.h, p4/M, z24.h, z1.h\n"
-      "fmin z25.h, p4/M, z25.h, z1.h\n"
-      "fmin z26.h, p4/M, z26.h, z1.h\n"
-      "fmin z27.h, p4/M, z27.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
-      "fmax z16.h, p4/M, z16.h, z0.h\n"
-      "fmax z17.h, p4/M, z17.h, z0.h\n"
-      "fmax z18.h, p4/M, z18.h, z0.h\n"
-      "fmax z19.h, p4/M, z19.h, z0.h\n"
-      "fmax z20.h, p4/M, z20.h, z0.h\n"
-      "fmax z21.h, p4/M, z21.h, z0.h\n"
-      "fmax z22.h, p4/M, z22.h, z0.h\n"
-      "fmax z23.h, p4/M, z23.h, z0.h\n"
-      "fmax z24.h, p4/M, z24.h, z0.h\n"
-      "fmax z25.h, p4/M, z25.h, z0.h\n"
-      "fmax z26.h, p4/M, z26.h, z0.h\n"
-      "fmax z27.h, p4/M, z27.h, z0.h\n"
+      "ld1rh { z28.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z29.h\n"
+      "fmin z9.h, p4/M, z9.h, z29.h\n"
+      "fmin z10.h, p4/M, z10.h, z29.h\n"
+      "fmin z11.h, p4/M, z11.h, z29.h\n"
+      "fmin z12.h, p4/M, z12.h, z29.h\n"
+      "fmin z13.h, p4/M, z13.h, z29.h\n"
+      "fmin z14.h, p4/M, z14.h, z29.h\n"
+      "fmin z15.h, p4/M, z15.h, z29.h\n"
+      "fmin z16.h, p4/M, z16.h, z29.h\n"
+      "fmin z17.h, p4/M, z17.h, z29.h\n"
+      "fmin z18.h, p4/M, z18.h, z29.h\n"
+      "fmin z19.h, p4/M, z19.h, z29.h\n"
+      "fmin z20.h, p4/M, z20.h, z29.h\n"
+      "fmin z21.h, p4/M, z21.h, z29.h\n"
+      "fmin z22.h, p4/M, z22.h, z29.h\n"
+      "fmin z23.h, p4/M, z23.h, z29.h\n"
+      "fmin z24.h, p4/M, z24.h, z29.h\n"
+      "fmin z25.h, p4/M, z25.h, z29.h\n"
+      "fmin z26.h, p4/M, z26.h, z29.h\n"
+      "fmin z27.h, p4/M, z27.h, z29.h\n"
+      "fmax z8.h, p4/M, z8.h, z28.h\n"
+      "fmax z9.h, p4/M, z9.h, z28.h\n"
+      "fmax z10.h, p4/M, z10.h, z28.h\n"
+      "fmax z11.h, p4/M, z11.h, z28.h\n"
+      "fmax z12.h, p4/M, z12.h, z28.h\n"
+      "fmax z13.h, p4/M, z13.h, z28.h\n"
+      "fmax z14.h, p4/M, z14.h, z28.h\n"
+      "fmax z15.h, p4/M, z15.h, z28.h\n"
+      "fmax z16.h, p4/M, z16.h, z28.h\n"
+      "fmax z17.h, p4/M, z17.h, z28.h\n"
+      "fmax z18.h, p4/M, z18.h, z28.h\n"
+      "fmax z19.h, p4/M, z19.h, z28.h\n"
+      "fmax z20.h, p4/M, z20.h, z28.h\n"
+      "fmax z21.h, p4/M, z21.h, z28.h\n"
+      "fmax z22.h, p4/M, z22.h, z28.h\n"
+      "fmax z23.h, p4/M, z23.h, z28.h\n"
+      "fmax z24.h, p4/M, z24.h, z28.h\n"
+      "fmax z25.h, p4/M, z25.h, z28.h\n"
+      "fmax z26.h, p4/M, z26.h, z28.h\n"
+      "fmax z27.h, p4/M, z27.h, z28.h\n"
       "64:"  // Height 5: No activation
       "st1h { z8.h }, p3, [x13]\n"
       "st1h { z9.h }, p2, [x13, #1, MUL VL]\n"
@@ -1240,35 +1240,35 @@
       "69:"  // Height 6: no bias
       "tbz %x[flags], #0, 70f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p3/Z, [x13]\n"
+      "add x24, x13, x20, LSL #1\n"
       "add x23, x24, x20, LSL #1\n"
+      "ld1h { z8.h }, p3/Z, [x13]\n"
       "add x22, x23, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p3/Z, [x23]\n"
-      "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p3/Z, [x22]\n"
-      "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
-      "ld1h { z28.h }, p3/Z, [x21]\n"
-      "ld1h { z29.h }, p2/Z, [x21, #1, MUL VL]\n"
-      "ld1h { z30.h }, p1/Z, [x21, #2, MUL VL]\n"
-      "ld1h { z31.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x24]\n"
+      "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x23]\n"
+      "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z20.h }, p3/Z, [x22]\n"
+      "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z24.h }, p3/Z, [x21]\n"
+      "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z28.h }, p3/Z, [x20]\n"
+      "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 71f\n"
       "70:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1300,16 +1300,16 @@
       "72:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 73f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 74f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1321,11 +1321,11 @@
       "b 74f\n"
       "73:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "74:"  // Height 6: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -1527,4 +1527,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
index 0b543b6..842db1a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp
@@ -163,11 +163,11 @@
       "7:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 8f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 9f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -180,72 +180,72 @@
       "10:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x12]\n"
+      "fmla z8.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z9.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10]\n"
+      "fmla z10.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
+      "fmla z11.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #4, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #5, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #6, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x12, #7, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[7]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
       "cmp x27, #0x8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z10.h, z17.h, z0.h[7]\n"
+      "fmla z11.h, z16.h, z0.h[7]\n"
       "add x26, x26, #0x10\n"
       "addvl x12, x12, #8\n"
       "addvl x11, x11, #8\n"
@@ -255,112 +255,112 @@
       "11:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x12]\n"
+      "fmla z8.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z9.h, z16.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z17.h, z0.h[0]\n"
+      "fmla z11.h, z16.h, z0.h[0]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[1]\n"
+      "fmla z9.h, z16.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z10.h, z17.h, z0.h[1]\n"
+      "fmla z11.h, z16.h, z0.h[1]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[2]\n"
+      "fmla z9.h, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z10.h, z17.h, z0.h[2]\n"
+      "fmla z11.h, z16.h, z0.h[2]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[3]\n"
+      "fmla z9.h, z16.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z10.h, z17.h, z0.h[3]\n"
+      "fmla z11.h, z16.h, z0.h[3]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[4]\n"
+      "fmla z9.h, z16.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z10.h, z17.h, z0.h[4]\n"
+      "fmla z11.h, z16.h, z0.h[4]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[5]\n"
+      "fmla z9.h, z16.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z10.h, z17.h, z0.h[5]\n"
+      "fmla z11.h, z16.h, z0.h[5]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[6]\n"
+      "fmla z9.h, z16.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z10.h, z17.h, z0.h[6]\n"
+      "fmla z11.h, z16.h, z0.h[6]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[7]\n"
+      "fmla z9.h, z16.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z17.h, z0.h[7]\n"
+      "fmla z11.h, z16.h, z0.h[7]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
@@ -372,17 +372,17 @@
       "bne 7b\n"
       "tbz %x[flags], #1, 13f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z17.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
+      "ld1rh { z16.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z17.h\n"
+      "fmin z9.h, p5/M, z9.h, z17.h\n"
+      "fmin z10.h, p5/M, z10.h, z17.h\n"
+      "fmin z11.h, p5/M, z11.h, z17.h\n"
+      "fmax z8.h, p5/M, z8.h, z16.h\n"
+      "fmax z9.h, p5/M, z9.h, z16.h\n"
+      "fmax z10.h, p5/M, z10.h, z16.h\n"
+      "fmax z11.h, p5/M, z11.h, z16.h\n"
       "13:"  // Height 1: No activation
       "st1h { z8.h }, p4, [x13]\n"
       "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -443,15 +443,15 @@
       "18:"  // Height 2: no bias
       "tbz %x[flags], #0, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
+      "add x20, x13, x20, LSL #1\n"
       "ld1h { z8.h }, p4/Z, [x13]\n"
       "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x20]\n"
+      "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 20f\n"
       "19:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -467,12 +467,12 @@
       "21:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 22f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -480,263 +480,263 @@
       "b 23f\n"
       "22:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "23:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "ble 25f\n"
       "24:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z1.h }, p0/Z, [x26]\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z1.h[0]\n"
+      "fmla z12.h, z17.h, z0.h[0]\n"
+      "fmla z9.h, z16.h, z1.h[0]\n"
+      "fmla z13.h, z16.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z17.h, z1.h[0]\n"
+      "fmla z14.h, z17.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n"
       "cmp x27, #0x8\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[0]\n"
+      "fmla z15.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[1]\n"
+      "fmla z12.h, z17.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[1]\n"
+      "fmla z13.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[1]\n"
+      "fmla z14.h, z17.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[1]\n"
+      "fmla z15.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[2]\n"
+      "fmla z12.h, z17.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[2]\n"
+      "fmla z13.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[2]\n"
+      "fmla z14.h, z17.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[2]\n"
+      "fmla z15.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[3]\n"
+      "fmla z12.h, z17.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[3]\n"
+      "fmla z13.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[3]\n"
+      "fmla z14.h, z17.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #4, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[3]\n"
+      "fmla z15.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[4]\n"
+      "fmla z12.h, z17.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[4]\n"
+      "fmla z13.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[4]\n"
+      "fmla z14.h, z17.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #5, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[4]\n"
+      "fmla z15.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[5]\n"
+      "fmla z12.h, z17.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[5]\n"
+      "fmla z13.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[5]\n"
+      "fmla z14.h, z17.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #6, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[5]\n"
+      "fmla z15.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[6]\n"
+      "fmla z12.h, z17.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[6]\n"
+      "fmla z13.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[6]\n"
+      "fmla z14.h, z17.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x12, #7, MUL VL]\n"
       "addvl x12, x12, #8\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[6]\n"
+      "fmla z15.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n"
       "addvl x11, x11, #8\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[7]\n"
+      "fmla z12.h, z17.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[7]\n"
+      "fmla z13.h, z16.h, z0.h[7]\n"
+      "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z10.h, z17.h, z1.h[7]\n"
+      "fmla z14.h, z17.h, z0.h[7]\n"
+      "fmla z11.h, z16.h, z1.h[7]\n"
+      "fmla z15.h, z16.h, z0.h[7]\n"
       "bgt 24b\n"
       "25:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[0]\n"
+      "fmla z12.h, z17.h, z1.h[0]\n"
+      "fmla z9.h, z16.h, z0.h[0]\n"
+      "fmla z13.h, z16.h, z1.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z17.h, z0.h[0]\n"
+      "fmla z14.h, z17.h, z1.h[0]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z11.h, z16.h, z0.h[0]\n"
+      "fmla z15.h, z16.h, z1.h[0]\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[1]\n"
+      "fmla z12.h, z17.h, z1.h[1]\n"
+      "fmla z9.h, z16.h, z0.h[1]\n"
+      "fmla z13.h, z16.h, z1.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z10.h, z17.h, z0.h[1]\n"
+      "fmla z14.h, z17.h, z1.h[1]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z11.h, z16.h, z0.h[1]\n"
+      "fmla z15.h, z16.h, z1.h[1]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[2]\n"
+      "fmla z12.h, z17.h, z1.h[2]\n"
+      "fmla z9.h, z16.h, z0.h[2]\n"
+      "fmla z13.h, z16.h, z1.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z10.h, z17.h, z0.h[2]\n"
+      "fmla z14.h, z17.h, z1.h[2]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z11.h, z16.h, z0.h[2]\n"
+      "fmla z15.h, z16.h, z1.h[2]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[3]\n"
+      "fmla z12.h, z17.h, z1.h[3]\n"
+      "fmla z9.h, z16.h, z0.h[3]\n"
+      "fmla z13.h, z16.h, z1.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z10.h, z17.h, z0.h[3]\n"
+      "fmla z14.h, z17.h, z1.h[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z11.h, z16.h, z0.h[3]\n"
+      "fmla z15.h, z16.h, z1.h[3]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[4]\n"
+      "fmla z12.h, z17.h, z1.h[4]\n"
+      "fmla z9.h, z16.h, z0.h[4]\n"
+      "fmla z13.h, z16.h, z1.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z10.h, z17.h, z0.h[4]\n"
+      "fmla z14.h, z17.h, z1.h[4]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z11.h, z16.h, z0.h[4]\n"
+      "fmla z15.h, z16.h, z1.h[4]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[5]\n"
+      "fmla z12.h, z17.h, z1.h[5]\n"
+      "fmla z9.h, z16.h, z0.h[5]\n"
+      "fmla z13.h, z16.h, z1.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z10.h, z17.h, z0.h[5]\n"
+      "fmla z14.h, z17.h, z1.h[5]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z11.h, z16.h, z0.h[5]\n"
+      "fmla z15.h, z16.h, z1.h[5]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[6]\n"
+      "fmla z12.h, z17.h, z1.h[6]\n"
+      "fmla z9.h, z16.h, z0.h[6]\n"
+      "fmla z13.h, z16.h, z1.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z10.h, z17.h, z0.h[6]\n"
+      "fmla z14.h, z17.h, z1.h[6]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z11.h, z16.h, z0.h[6]\n"
+      "fmla z15.h, z16.h, z1.h[6]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x12]\n"
+      "ld1h { z16.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z17.h, z0.h[7]\n"
+      "fmla z12.h, z17.h, z1.h[7]\n"
+      "fmla z9.h, z16.h, z0.h[7]\n"
+      "fmla z13.h, z16.h, z1.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z17.h, z0.h[7]\n"
+      "fmla z14.h, z17.h, z1.h[7]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z11.h, z16.h, z0.h[7]\n"
+      "fmla z15.h, z16.h, z1.h[7]\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "26:"  // Height 2: Multiply loop: multiply skip
@@ -748,25 +748,25 @@
       "add x25, x13, x20, LSL #1\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z17.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
+      "ld1rh { z16.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z17.h\n"
+      "fmin z9.h, p5/M, z9.h, z17.h\n"
+      "fmin z10.h, p5/M, z10.h, z17.h\n"
+      "fmin z11.h, p5/M, z11.h, z17.h\n"
+      "fmin z12.h, p5/M, z12.h, z17.h\n"
+      "fmin z13.h, p5/M, z13.h, z17.h\n"
+      "fmin z14.h, p5/M, z14.h, z17.h\n"
+      "fmin z15.h, p5/M, z15.h, z17.h\n"
+      "fmax z8.h, p5/M, z8.h, z16.h\n"
+      "fmax z9.h, p5/M, z9.h, z16.h\n"
+      "fmax z10.h, p5/M, z10.h, z16.h\n"
+      "fmax z11.h, p5/M, z11.h, z16.h\n"
+      "fmax z12.h, p5/M, z12.h, z16.h\n"
+      "fmax z13.h, p5/M, z13.h, z16.h\n"
+      "fmax z14.h, p5/M, z14.h, z16.h\n"
+      "fmax z15.h, p5/M, z15.h, z16.h\n"
       "27:"  // Height 2: No activation
       "st1h { z8.h }, p4, [x13]\n"
       "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -835,20 +835,20 @@
       "32:"  // Height 3: no bias
       "tbz %x[flags], #0, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x21, x13, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z8.h }, p4/Z, [x13]\n"
       "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x21]\n"
+      "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x20]\n"
+      "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 34f\n"
       "33:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -868,13 +868,13 @@
       "35:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 36f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 37f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -883,153 +883,153 @@
       "b 37f\n"
       "36:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "37:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "ble 39f\n"
       "38:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1rqh { z0.h }, p0/Z, [x24]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "fmla z8.h, z21.h, z2.h[0]\n"
+      "fmla z12.h, z21.h, z1.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z16.h, z21.h, z0.h[0]\n"
+      "fmla z9.h, z20.h, z2.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "fmla z13.h, z20.h, z1.h[0]\n"
+      "fmla z17.h, z20.h, z0.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "cmp x27, #0x8\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z10.h, z21.h, z2.h[0]\n"
+      "fmla z14.h, z21.h, z1.h[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z18.h, z21.h, z0.h[0]\n"
+      "fmla z11.h, z20.h, z2.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #1, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[0]\n"
+      "fmla z19.h, z20.h, z0.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[1]\n"
+      "fmla z12.h, z21.h, z1.h[1]\n"
+      "fmla z16.h, z21.h, z0.h[1]\n"
+      "fmla z9.h, z20.h, z2.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[1]\n"
+      "fmla z17.h, z20.h, z0.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[1]\n"
+      "fmla z14.h, z21.h, z1.h[1]\n"
+      "fmla z18.h, z21.h, z0.h[1]\n"
+      "fmla z11.h, z20.h, z2.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[1]\n"
+      "fmla z19.h, z20.h, z0.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[2]\n"
+      "fmla z12.h, z21.h, z1.h[2]\n"
+      "fmla z16.h, z21.h, z0.h[2]\n"
+      "fmla z9.h, z20.h, z2.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[2]\n"
+      "fmla z17.h, z20.h, z0.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[2]\n"
+      "fmla z14.h, z21.h, z1.h[2]\n"
+      "fmla z18.h, z21.h, z0.h[2]\n"
+      "fmla z11.h, z20.h, z2.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[2]\n"
+      "fmla z19.h, z20.h, z0.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[3]\n"
+      "fmla z12.h, z21.h, z1.h[3]\n"
+      "fmla z16.h, z21.h, z0.h[3]\n"
+      "fmla z9.h, z20.h, z2.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[3]\n"
+      "fmla z17.h, z20.h, z0.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[3]\n"
+      "fmla z14.h, z21.h, z1.h[3]\n"
+      "fmla z18.h, z21.h, z0.h[3]\n"
+      "fmla z11.h, z20.h, z2.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #4, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[3]\n"
+      "fmla z19.h, z20.h, z0.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #4, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[4]\n"
+      "fmla z12.h, z21.h, z1.h[4]\n"
+      "fmla z16.h, z21.h, z0.h[4]\n"
+      "fmla z9.h, z20.h, z2.h[4]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[4]\n"
+      "fmla z17.h, z20.h, z0.h[4]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #4, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[4]\n"
+      "fmla z14.h, z21.h, z1.h[4]\n"
+      "fmla z18.h, z21.h, z0.h[4]\n"
+      "fmla z11.h, z20.h, z2.h[4]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #5, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[4]\n"
+      "fmla z19.h, z20.h, z0.h[4]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #5, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[5]\n"
+      "fmla z12.h, z21.h, z1.h[5]\n"
+      "fmla z16.h, z21.h, z0.h[5]\n"
+      "fmla z9.h, z20.h, z2.h[5]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[5]\n"
+      "fmla z17.h, z20.h, z0.h[5]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #5, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[5]\n"
+      "fmla z14.h, z21.h, z1.h[5]\n"
+      "fmla z18.h, z21.h, z0.h[5]\n"
+      "fmla z11.h, z20.h, z2.h[5]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #6, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[5]\n"
+      "fmla z19.h, z20.h, z0.h[5]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #6, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[6]\n"
+      "fmla z12.h, z21.h, z1.h[6]\n"
+      "fmla z16.h, z21.h, z0.h[6]\n"
+      "fmla z9.h, z20.h, z2.h[6]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[6]\n"
+      "fmla z17.h, z20.h, z0.h[6]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #6, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[6]\n"
+      "fmla z14.h, z21.h, z1.h[6]\n"
+      "fmla z18.h, z21.h, z0.h[6]\n"
+      "fmla z11.h, z20.h, z2.h[6]\n"
+      "ld1h { z21.h }, p5/Z, [x12, #7, MUL VL]\n"
       "addvl x12, x12, #8\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[6]\n"
+      "fmla z19.h, z20.h, z0.h[6]\n"
+      "ld1h { z20.h }, p5/Z, [x11, #7, MUL VL]\n"
       "addvl x11, x11, #8\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[7]\n"
+      "fmla z12.h, z21.h, z1.h[7]\n"
+      "fmla z16.h, z21.h, z0.h[7]\n"
+      "fmla z9.h, z20.h, z2.h[7]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[7]\n"
+      "fmla z17.h, z20.h, z0.h[7]\n"
+      "ld1h { z20.h }, p5/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z10.h, z21.h, z2.h[7]\n"
+      "fmla z14.h, z21.h, z1.h[7]\n"
+      "fmla z18.h, z21.h, z0.h[7]\n"
+      "fmla z11.h, z20.h, z2.h[7]\n"
+      "fmla z15.h, z20.h, z1.h[7]\n"
+      "fmla z19.h, z20.h, z0.h[7]\n"
       "bgt 38b\n"
       "39:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -1037,179 +1037,179 @@
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "fmla z8.h, z21.h, z0.h[0]\n"
+      "fmla z12.h, z21.h, z1.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z16.h, z21.h, z2.h[0]\n"
+      "fmla z9.h, z20.h, z0.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "fmla z13.h, z20.h, z1.h[0]\n"
+      "fmla z17.h, z20.h, z2.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z10.h, z21.h, z0.h[0]\n"
+      "fmla z14.h, z21.h, z1.h[0]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z18.h, z21.h, z2.h[0]\n"
+      "fmla z11.h, z20.h, z0.h[0]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z15.h, z20.h, z1.h[0]\n"
+      "fmla z19.h, z20.h, z2.h[0]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[1]\n"
+      "fmla z12.h, z21.h, z1.h[1]\n"
+      "fmla z16.h, z21.h, z2.h[1]\n"
+      "fmla z9.h, z20.h, z0.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[1]\n"
+      "fmla z17.h, z20.h, z2.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z10.h, z21.h, z0.h[1]\n"
+      "fmla z14.h, z21.h, z1.h[1]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z18.h, z21.h, z2.h[1]\n"
+      "fmla z11.h, z20.h, z0.h[1]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z15.h, z20.h, z1.h[1]\n"
+      "fmla z19.h, z20.h, z2.h[1]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[2]\n"
+      "fmla z12.h, z21.h, z1.h[2]\n"
+      "fmla z16.h, z21.h, z2.h[2]\n"
+      "fmla z9.h, z20.h, z0.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[2]\n"
+      "fmla z17.h, z20.h, z2.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z10.h, z21.h, z0.h[2]\n"
+      "fmla z14.h, z21.h, z1.h[2]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z18.h, z21.h, z2.h[2]\n"
+      "fmla z11.h, z20.h, z0.h[2]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z15.h, z20.h, z1.h[2]\n"
+      "fmla z19.h, z20.h, z2.h[2]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[3]\n"
+      "fmla z12.h, z21.h, z1.h[3]\n"
+      "fmla z16.h, z21.h, z2.h[3]\n"
+      "fmla z9.h, z20.h, z0.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[3]\n"
+      "fmla z17.h, z20.h, z2.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z10.h, z21.h, z0.h[3]\n"
+      "fmla z14.h, z21.h, z1.h[3]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z18.h, z21.h, z2.h[3]\n"
+      "fmla z11.h, z20.h, z0.h[3]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z15.h, z20.h, z1.h[3]\n"
+      "fmla z19.h, z20.h, z2.h[3]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[4]\n"
+      "fmla z12.h, z21.h, z1.h[4]\n"
+      "fmla z16.h, z21.h, z2.h[4]\n"
+      "fmla z9.h, z20.h, z0.h[4]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[4]\n"
+      "fmla z17.h, z20.h, z2.h[4]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z10.h, z21.h, z0.h[4]\n"
+      "fmla z14.h, z21.h, z1.h[4]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z18.h, z21.h, z2.h[4]\n"
+      "fmla z11.h, z20.h, z0.h[4]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z15.h, z20.h, z1.h[4]\n"
+      "fmla z19.h, z20.h, z2.h[4]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[5]\n"
+      "fmla z12.h, z21.h, z1.h[5]\n"
+      "fmla z16.h, z21.h, z2.h[5]\n"
+      "fmla z9.h, z20.h, z0.h[5]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[5]\n"
+      "fmla z17.h, z20.h, z2.h[5]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z10.h, z21.h, z0.h[5]\n"
+      "fmla z14.h, z21.h, z1.h[5]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z18.h, z21.h, z2.h[5]\n"
+      "fmla z11.h, z20.h, z0.h[5]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z15.h, z20.h, z1.h[5]\n"
+      "fmla z19.h, z20.h, z2.h[5]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[6]\n"
+      "fmla z12.h, z21.h, z1.h[6]\n"
+      "fmla z16.h, z21.h, z2.h[6]\n"
+      "fmla z9.h, z20.h, z0.h[6]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[6]\n"
+      "fmla z17.h, z20.h, z2.h[6]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z10.h, z21.h, z0.h[6]\n"
+      "fmla z14.h, z21.h, z1.h[6]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z18.h, z21.h, z2.h[6]\n"
+      "fmla z11.h, z20.h, z0.h[6]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z15.h, z20.h, z1.h[6]\n"
+      "fmla z19.h, z20.h, z2.h[6]\n"
       "ble 40f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z21.h }, p5/Z, [x12]\n"
+      "ld1h { z20.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z21.h, z0.h[7]\n"
+      "fmla z12.h, z21.h, z1.h[7]\n"
+      "fmla z16.h, z21.h, z2.h[7]\n"
+      "fmla z9.h, z20.h, z0.h[7]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
       "addvl x12, x12, #1\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z20.h, z1.h[7]\n"
+      "fmla z17.h, z20.h, z2.h[7]\n"
+      "ld1h { z20.h }, p5/Z, [x9]\n"
       "addvl x11, x11, #1\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z10.h, z21.h, z0.h[7]\n"
+      "fmla z14.h, z21.h, z1.h[7]\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z18.h, z21.h, z2.h[7]\n"
+      "fmla z11.h, z20.h, z0.h[7]\n"
+      "fmla z15.h, z20.h, z1.h[7]\n"
+      "fmla z19.h, z20.h, z2.h[7]\n"
       "40:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1220,33 +1220,33 @@
       "add x24, x25, x20, LSL #1\n"
       "tbz %x[flags], #1, 41f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z21.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmin z16.h, p5/M, z16.h, z1.h\n"
-      "fmin z17.h, p5/M, z17.h, z1.h\n"
-      "fmin z18.h, p5/M, z18.h, z1.h\n"
-      "fmin z19.h, p5/M, z19.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
-      "fmax z16.h, p5/M, z16.h, z0.h\n"
-      "fmax z17.h, p5/M, z17.h, z0.h\n"
-      "fmax z18.h, p5/M, z18.h, z0.h\n"
-      "fmax z19.h, p5/M, z19.h, z0.h\n"
+      "ld1rh { z20.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z21.h\n"
+      "fmin z9.h, p5/M, z9.h, z21.h\n"
+      "fmin z10.h, p5/M, z10.h, z21.h\n"
+      "fmin z11.h, p5/M, z11.h, z21.h\n"
+      "fmin z12.h, p5/M, z12.h, z21.h\n"
+      "fmin z13.h, p5/M, z13.h, z21.h\n"
+      "fmin z14.h, p5/M, z14.h, z21.h\n"
+      "fmin z15.h, p5/M, z15.h, z21.h\n"
+      "fmin z16.h, p5/M, z16.h, z21.h\n"
+      "fmin z17.h, p5/M, z17.h, z21.h\n"
+      "fmin z18.h, p5/M, z18.h, z21.h\n"
+      "fmin z19.h, p5/M, z19.h, z21.h\n"
+      "fmax z8.h, p5/M, z8.h, z20.h\n"
+      "fmax z9.h, p5/M, z9.h, z20.h\n"
+      "fmax z10.h, p5/M, z10.h, z20.h\n"
+      "fmax z11.h, p5/M, z11.h, z20.h\n"
+      "fmax z12.h, p5/M, z12.h, z20.h\n"
+      "fmax z13.h, p5/M, z13.h, z20.h\n"
+      "fmax z14.h, p5/M, z14.h, z20.h\n"
+      "fmax z15.h, p5/M, z15.h, z20.h\n"
+      "fmax z16.h, p5/M, z16.h, z20.h\n"
+      "fmax z17.h, p5/M, z17.h, z20.h\n"
+      "fmax z18.h, p5/M, z18.h, z20.h\n"
+      "fmax z19.h, p5/M, z19.h, z20.h\n"
       "41:"  // Height 3: No activation
       "st1h { z8.h }, p4, [x13]\n"
       "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -1323,25 +1323,25 @@
       "46:"  // Height 4: no bias
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x22, x13, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z8.h }, p4/Z, [x13]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p4/Z, [x23]\n"
-      "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x22]\n"
+      "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x21]\n"
+      "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x20]\n"
+      "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 48f\n"
       "47:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -1365,14 +1365,14 @@
       "49:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1382,188 +1382,188 @@
       "b 51f\n"
       "50:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "51:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "ble 53f\n"
       "52:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z3.h }, p0/Z, [x26]\n"
+      "ld1rqh { z2.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z1.h }, p0/Z, [x24]\n"
+      "ld1rqh { z0.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z3.h[0]\n"
+      "fmla z12.h, z25.h, z2.h[0]\n"
+      "fmla z16.h, z25.h, z1.h[0]\n"
+      "fmla z20.h, z25.h, z0.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
+      "fmla z9.h, z24.h, z3.h[0]\n"
+      "fmla z13.h, z24.h, z2.h[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+      "fmla z17.h, z24.h, z1.h[0]\n"
+      "fmla z21.h, z24.h, z0.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z25.h, z3.h[0]\n"
+      "fmla z14.h, z25.h, z2.h[0]\n"
+      "fmla z18.h, z25.h, z1.h[0]\n"
+      "fmla z22.h, z25.h, z0.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[0]\n"
+      "fmla z15.h, z24.h, z2.h[0]\n"
+      "fmla z19.h, z24.h, z1.h[0]\n"
+      "fmla z23.h, z24.h, z0.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[1]\n"
+      "fmla z12.h, z25.h, z2.h[1]\n"
+      "fmla z16.h, z25.h, z1.h[1]\n"
+      "fmla z20.h, z25.h, z0.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[1]\n"
+      "fmla z13.h, z24.h, z2.h[1]\n"
+      "fmla z17.h, z24.h, z1.h[1]\n"
+      "fmla z21.h, z24.h, z0.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[1]\n"
+      "fmla z14.h, z25.h, z2.h[1]\n"
+      "fmla z18.h, z25.h, z1.h[1]\n"
+      "fmla z22.h, z25.h, z0.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[1]\n"
+      "fmla z15.h, z24.h, z2.h[1]\n"
+      "fmla z19.h, z24.h, z1.h[1]\n"
+      "fmla z23.h, z24.h, z0.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[2]\n"
+      "fmla z12.h, z25.h, z2.h[2]\n"
+      "fmla z16.h, z25.h, z1.h[2]\n"
+      "fmla z20.h, z25.h, z0.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[2]\n"
+      "fmla z13.h, z24.h, z2.h[2]\n"
+      "fmla z17.h, z24.h, z1.h[2]\n"
+      "fmla z21.h, z24.h, z0.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[2]\n"
+      "fmla z14.h, z25.h, z2.h[2]\n"
+      "fmla z18.h, z25.h, z1.h[2]\n"
+      "fmla z22.h, z25.h, z0.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[2]\n"
+      "fmla z15.h, z24.h, z2.h[2]\n"
+      "fmla z19.h, z24.h, z1.h[2]\n"
+      "fmla z23.h, z24.h, z0.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[3]\n"
+      "fmla z12.h, z25.h, z2.h[3]\n"
+      "fmla z16.h, z25.h, z1.h[3]\n"
+      "fmla z20.h, z25.h, z0.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[3]\n"
+      "fmla z13.h, z24.h, z2.h[3]\n"
+      "fmla z17.h, z24.h, z1.h[3]\n"
+      "fmla z21.h, z24.h, z0.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[3]\n"
+      "fmla z14.h, z25.h, z2.h[3]\n"
+      "fmla z18.h, z25.h, z1.h[3]\n"
+      "fmla z22.h, z25.h, z0.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #4, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[3]\n"
+      "fmla z15.h, z24.h, z2.h[3]\n"
+      "fmla z19.h, z24.h, z1.h[3]\n"
+      "fmla z23.h, z24.h, z0.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #4, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[4]\n"
+      "fmla z12.h, z25.h, z2.h[4]\n"
+      "fmla z16.h, z25.h, z1.h[4]\n"
+      "fmla z20.h, z25.h, z0.h[4]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[4]\n"
+      "fmla z13.h, z24.h, z2.h[4]\n"
+      "fmla z17.h, z24.h, z1.h[4]\n"
+      "fmla z21.h, z24.h, z0.h[4]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #4, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[4]\n"
+      "fmla z14.h, z25.h, z2.h[4]\n"
+      "fmla z18.h, z25.h, z1.h[4]\n"
+      "fmla z22.h, z25.h, z0.h[4]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #5, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[4]\n"
+      "fmla z15.h, z24.h, z2.h[4]\n"
+      "fmla z19.h, z24.h, z1.h[4]\n"
+      "fmla z23.h, z24.h, z0.h[4]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #5, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[5]\n"
+      "fmla z12.h, z25.h, z2.h[5]\n"
+      "fmla z16.h, z25.h, z1.h[5]\n"
+      "fmla z20.h, z25.h, z0.h[5]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[5]\n"
+      "fmla z13.h, z24.h, z2.h[5]\n"
+      "fmla z17.h, z24.h, z1.h[5]\n"
+      "fmla z21.h, z24.h, z0.h[5]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #5, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[5]\n"
+      "fmla z14.h, z25.h, z2.h[5]\n"
+      "fmla z18.h, z25.h, z1.h[5]\n"
+      "fmla z22.h, z25.h, z0.h[5]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #6, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[5]\n"
+      "fmla z15.h, z24.h, z2.h[5]\n"
+      "fmla z19.h, z24.h, z1.h[5]\n"
+      "fmla z23.h, z24.h, z0.h[5]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #6, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[6]\n"
+      "fmla z12.h, z25.h, z2.h[6]\n"
+      "fmla z16.h, z25.h, z1.h[6]\n"
+      "fmla z20.h, z25.h, z0.h[6]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[6]\n"
+      "fmla z13.h, z24.h, z2.h[6]\n"
+      "fmla z17.h, z24.h, z1.h[6]\n"
+      "fmla z21.h, z24.h, z0.h[6]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #6, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[6]\n"
+      "fmla z14.h, z25.h, z2.h[6]\n"
+      "fmla z18.h, z25.h, z1.h[6]\n"
+      "fmla z22.h, z25.h, z0.h[6]\n"
+      "ld1h { z25.h }, p5/Z, [x12, #7, MUL VL]\n"
       "addvl x12, x12, #8\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[6]\n"
+      "fmla z15.h, z24.h, z2.h[6]\n"
+      "fmla z19.h, z24.h, z1.h[6]\n"
+      "fmla z23.h, z24.h, z0.h[6]\n"
+      "ld1h { z24.h }, p5/Z, [x11, #7, MUL VL]\n"
       "addvl x11, x11, #8\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[7]\n"
+      "fmla z12.h, z25.h, z2.h[7]\n"
+      "fmla z16.h, z25.h, z1.h[7]\n"
+      "fmla z20.h, z25.h, z0.h[7]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[7]\n"
+      "fmla z13.h, z24.h, z2.h[7]\n"
+      "fmla z17.h, z24.h, z1.h[7]\n"
+      "fmla z21.h, z24.h, z0.h[7]\n"
+      "ld1h { z24.h }, p5/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z10.h, z25.h, z3.h[7]\n"
+      "fmla z14.h, z25.h, z2.h[7]\n"
+      "fmla z18.h, z25.h, z1.h[7]\n"
+      "fmla z22.h, z25.h, z0.h[7]\n"
+      "fmla z11.h, z24.h, z3.h[7]\n"
+      "fmla z15.h, z24.h, z2.h[7]\n"
+      "fmla z19.h, z24.h, z1.h[7]\n"
+      "fmla z23.h, z24.h, z0.h[7]\n"
       "bgt 52b\n"
       "53:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -1572,211 +1572,211 @@
       "subs x27, x27, #0x1\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[0]\n"
+      "fmla z12.h, z25.h, z1.h[0]\n"
+      "fmla z16.h, z25.h, z2.h[0]\n"
+      "fmla z20.h, z25.h, z3.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "addvl x12, x12, #1\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
+      "fmla z9.h, z24.h, z0.h[0]\n"
+      "fmla z13.h, z24.h, z1.h[0]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[0]\n"
+      "fmla z21.h, z24.h, z3.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
+      "fmla z10.h, z25.h, z0.h[0]\n"
+      "fmla z14.h, z25.h, z1.h[0]\n"
+      "fmla z18.h, z25.h, z2.h[0]\n"
+      "fmla z22.h, z25.h, z3.h[0]\n"
+      "fmla z11.h, z24.h, z0.h[0]\n"
+      "fmla z15.h, z24.h, z1.h[0]\n"
+      "fmla z19.h, z24.h, z2.h[0]\n"
+      "fmla z23.h, z24.h, z3.h[0]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[1]\n"
+      "fmla z12.h, z25.h, z1.h[1]\n"
+      "fmla z16.h, z25.h, z2.h[1]\n"
+      "fmla z20.h, z25.h, z3.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z9.h, z24.h, z0.h[1]\n"
+      "fmla z13.h, z24.h, z1.h[1]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[1]\n"
+      "fmla z21.h, z24.h, z3.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z10.h, z25.h, z0.h[1]\n"
+      "fmla z14.h, z25.h, z1.h[1]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
+      "fmla z18.h, z25.h, z2.h[1]\n"
+      "fmla z22.h, z25.h, z3.h[1]\n"
+      "fmla z11.h, z24.h, z0.h[1]\n"
+      "fmla z15.h, z24.h, z1.h[1]\n"
+      "fmla z19.h, z24.h, z2.h[1]\n"
+      "fmla z23.h, z24.h, z3.h[1]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[2]\n"
+      "fmla z12.h, z25.h, z1.h[2]\n"
+      "fmla z16.h, z25.h, z2.h[2]\n"
+      "fmla z20.h, z25.h, z3.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z9.h, z24.h, z0.h[2]\n"
+      "fmla z13.h, z24.h, z1.h[2]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[2]\n"
+      "fmla z21.h, z24.h, z3.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z10.h, z25.h, z0.h[2]\n"
+      "fmla z14.h, z25.h, z1.h[2]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
+      "fmla z18.h, z25.h, z2.h[2]\n"
+      "fmla z22.h, z25.h, z3.h[2]\n"
+      "fmla z11.h, z24.h, z0.h[2]\n"
+      "fmla z15.h, z24.h, z1.h[2]\n"
+      "fmla z19.h, z24.h, z2.h[2]\n"
+      "fmla z23.h, z24.h, z3.h[2]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[3]\n"
+      "fmla z12.h, z25.h, z1.h[3]\n"
+      "fmla z16.h, z25.h, z2.h[3]\n"
+      "fmla z20.h, z25.h, z3.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z9.h, z24.h, z0.h[3]\n"
+      "fmla z13.h, z24.h, z1.h[3]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[3]\n"
+      "fmla z21.h, z24.h, z3.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z10.h, z25.h, z0.h[3]\n"
+      "fmla z14.h, z25.h, z1.h[3]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
+      "fmla z18.h, z25.h, z2.h[3]\n"
+      "fmla z22.h, z25.h, z3.h[3]\n"
+      "fmla z11.h, z24.h, z0.h[3]\n"
+      "fmla z15.h, z24.h, z1.h[3]\n"
+      "fmla z19.h, z24.h, z2.h[3]\n"
+      "fmla z23.h, z24.h, z3.h[3]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[4]\n"
+      "fmla z12.h, z25.h, z1.h[4]\n"
+      "fmla z16.h, z25.h, z2.h[4]\n"
+      "fmla z20.h, z25.h, z3.h[4]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z9.h, z24.h, z0.h[4]\n"
+      "fmla z13.h, z24.h, z1.h[4]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[4]\n"
+      "fmla z21.h, z24.h, z3.h[4]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z10.h, z25.h, z0.h[4]\n"
+      "fmla z14.h, z25.h, z1.h[4]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
+      "fmla z18.h, z25.h, z2.h[4]\n"
+      "fmla z22.h, z25.h, z3.h[4]\n"
+      "fmla z11.h, z24.h, z0.h[4]\n"
+      "fmla z15.h, z24.h, z1.h[4]\n"
+      "fmla z19.h, z24.h, z2.h[4]\n"
+      "fmla z23.h, z24.h, z3.h[4]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[5]\n"
+      "fmla z12.h, z25.h, z1.h[5]\n"
+      "fmla z16.h, z25.h, z2.h[5]\n"
+      "fmla z20.h, z25.h, z3.h[5]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z9.h, z24.h, z0.h[5]\n"
+      "fmla z13.h, z24.h, z1.h[5]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[5]\n"
+      "fmla z21.h, z24.h, z3.h[5]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z10.h, z25.h, z0.h[5]\n"
+      "fmla z14.h, z25.h, z1.h[5]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
+      "fmla z18.h, z25.h, z2.h[5]\n"
+      "fmla z22.h, z25.h, z3.h[5]\n"
+      "fmla z11.h, z24.h, z0.h[5]\n"
+      "fmla z15.h, z24.h, z1.h[5]\n"
+      "fmla z19.h, z24.h, z2.h[5]\n"
+      "fmla z23.h, z24.h, z3.h[5]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[6]\n"
+      "fmla z12.h, z25.h, z1.h[6]\n"
+      "fmla z16.h, z25.h, z2.h[6]\n"
+      "fmla z20.h, z25.h, z3.h[6]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z9.h, z24.h, z0.h[6]\n"
+      "fmla z13.h, z24.h, z1.h[6]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[6]\n"
+      "fmla z21.h, z24.h, z3.h[6]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z10.h, z25.h, z0.h[6]\n"
+      "fmla z14.h, z25.h, z1.h[6]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
+      "fmla z18.h, z25.h, z2.h[6]\n"
+      "fmla z22.h, z25.h, z3.h[6]\n"
+      "fmla z11.h, z24.h, z0.h[6]\n"
+      "fmla z15.h, z24.h, z1.h[6]\n"
+      "fmla z19.h, z24.h, z2.h[6]\n"
+      "fmla z23.h, z24.h, z3.h[6]\n"
       "ble 54f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "ld1h { z25.h }, p5/Z, [x12]\n"
+      "ld1h { z24.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z25.h, z0.h[7]\n"
+      "fmla z12.h, z25.h, z1.h[7]\n"
+      "fmla z16.h, z25.h, z2.h[7]\n"
+      "fmla z20.h, z25.h, z3.h[7]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
       "addvl x12, x12, #1\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z9.h, z24.h, z0.h[7]\n"
+      "fmla z13.h, z24.h, z1.h[7]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z24.h, z2.h[7]\n"
+      "fmla z21.h, z24.h, z3.h[7]\n"
+      "ld1h { z24.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z10.h, z25.h, z0.h[7]\n"
+      "fmla z14.h, z25.h, z1.h[7]\n"
+      "fmla z18.h, z25.h, z2.h[7]\n"
+      "fmla z22.h, z25.h, z3.h[7]\n"
+      "fmla z11.h, z24.h, z0.h[7]\n"
+      "fmla z15.h, z24.h, z1.h[7]\n"
+      "fmla z19.h, z24.h, z2.h[7]\n"
+      "fmla z23.h, z24.h, z3.h[7]\n"
       "54:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1788,41 +1788,41 @@
       "add x23, x24, x20, LSL #1\n"
       "tbz %x[flags], #1, 55f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z25.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmin z16.h, p5/M, z16.h, z1.h\n"
-      "fmin z17.h, p5/M, z17.h, z1.h\n"
-      "fmin z18.h, p5/M, z18.h, z1.h\n"
-      "fmin z19.h, p5/M, z19.h, z1.h\n"
-      "fmin z20.h, p5/M, z20.h, z1.h\n"
-      "fmin z21.h, p5/M, z21.h, z1.h\n"
-      "fmin z22.h, p5/M, z22.h, z1.h\n"
-      "fmin z23.h, p5/M, z23.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
-      "fmax z16.h, p5/M, z16.h, z0.h\n"
-      "fmax z17.h, p5/M, z17.h, z0.h\n"
-      "fmax z18.h, p5/M, z18.h, z0.h\n"
-      "fmax z19.h, p5/M, z19.h, z0.h\n"
-      "fmax z20.h, p5/M, z20.h, z0.h\n"
-      "fmax z21.h, p5/M, z21.h, z0.h\n"
-      "fmax z22.h, p5/M, z22.h, z0.h\n"
-      "fmax z23.h, p5/M, z23.h, z0.h\n"
+      "ld1rh { z24.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z25.h\n"
+      "fmin z9.h, p5/M, z9.h, z25.h\n"
+      "fmin z10.h, p5/M, z10.h, z25.h\n"
+      "fmin z11.h, p5/M, z11.h, z25.h\n"
+      "fmin z12.h, p5/M, z12.h, z25.h\n"
+      "fmin z13.h, p5/M, z13.h, z25.h\n"
+      "fmin z14.h, p5/M, z14.h, z25.h\n"
+      "fmin z15.h, p5/M, z15.h, z25.h\n"
+      "fmin z16.h, p5/M, z16.h, z25.h\n"
+      "fmin z17.h, p5/M, z17.h, z25.h\n"
+      "fmin z18.h, p5/M, z18.h, z25.h\n"
+      "fmin z19.h, p5/M, z19.h, z25.h\n"
+      "fmin z20.h, p5/M, z20.h, z25.h\n"
+      "fmin z21.h, p5/M, z21.h, z25.h\n"
+      "fmin z22.h, p5/M, z22.h, z25.h\n"
+      "fmin z23.h, p5/M, z23.h, z25.h\n"
+      "fmax z8.h, p5/M, z8.h, z24.h\n"
+      "fmax z9.h, p5/M, z9.h, z24.h\n"
+      "fmax z10.h, p5/M, z10.h, z24.h\n"
+      "fmax z11.h, p5/M, z11.h, z24.h\n"
+      "fmax z12.h, p5/M, z12.h, z24.h\n"
+      "fmax z13.h, p5/M, z13.h, z24.h\n"
+      "fmax z14.h, p5/M, z14.h, z24.h\n"
+      "fmax z15.h, p5/M, z15.h, z24.h\n"
+      "fmax z16.h, p5/M, z16.h, z24.h\n"
+      "fmax z17.h, p5/M, z17.h, z24.h\n"
+      "fmax z18.h, p5/M, z18.h, z24.h\n"
+      "fmax z19.h, p5/M, z19.h, z24.h\n"
+      "fmax z20.h, p5/M, z20.h, z24.h\n"
+      "fmax z21.h, p5/M, z21.h, z24.h\n"
+      "fmax z22.h, p5/M, z22.h, z24.h\n"
+      "fmax z23.h, p5/M, z23.h, z24.h\n"
       "55:"  // Height 4: No activation
       "st1h { z8.h }, p4, [x13]\n"
       "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -1907,30 +1907,30 @@
       "60:"  // Height 5: no bias
       "tbz %x[flags], #0, 61f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p4/Z, [x13]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x23, x13, x20, LSL #1\n"
       "add x22, x23, x20, LSL #1\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p4/Z, [x23]\n"
-      "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p4/Z, [x22]\n"
-      "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x23]\n"
+      "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x22]\n"
+      "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x21]\n"
+      "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x20]\n"
+      "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 62f\n"
       "61:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1958,15 +1958,15 @@
       "63:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 64f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 65f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1977,223 +1977,223 @@
       "b 65f\n"
       "64:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "65:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "ble 67f\n"
       "66:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z4.h }, p0/Z, [x26]\n"
+      "ld1rqh { z3.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z1.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1rqh { z0.h }, p0/Z, [x22]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "fmla z8.h, z29.h, z4.h[0]\n"
+      "fmla z12.h, z29.h, z3.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z16.h, z29.h, z2.h[0]\n"
+      "fmla z20.h, z29.h, z1.h[0]\n"
       "add x25, x25, #0x10\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z0.h[0]\n"
+      "fmla z9.h, z28.h, z4.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "add x24, x24, #0x10\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
+      "fmla z13.h, z28.h, z3.h[0]\n"
+      "fmla z17.h, z28.h, z2.h[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+      "fmla z21.h, z28.h, z1.h[0]\n"
+      "fmla z25.h, z28.h, z0.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z29.h, z4.h[0]\n"
+      "fmla z14.h, z29.h, z3.h[0]\n"
+      "fmla z18.h, z29.h, z2.h[0]\n"
+      "fmla z22.h, z29.h, z1.h[0]\n"
+      "fmla z26.h, z29.h, z0.h[0]\n"
+      "fmla z11.h, z28.h, z4.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[0]\n"
+      "fmla z19.h, z28.h, z2.h[0]\n"
+      "fmla z23.h, z28.h, z1.h[0]\n"
+      "fmla z27.h, z28.h, z0.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[1]\n"
+      "fmla z12.h, z29.h, z3.h[1]\n"
+      "fmla z16.h, z29.h, z2.h[1]\n"
+      "fmla z20.h, z29.h, z1.h[1]\n"
+      "fmla z24.h, z29.h, z0.h[1]\n"
+      "fmla z9.h, z28.h, z4.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[1]\n"
+      "fmla z17.h, z28.h, z2.h[1]\n"
+      "fmla z21.h, z28.h, z1.h[1]\n"
+      "fmla z25.h, z28.h, z0.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[1]\n"
+      "fmla z14.h, z29.h, z3.h[1]\n"
+      "fmla z18.h, z29.h, z2.h[1]\n"
+      "fmla z22.h, z29.h, z1.h[1]\n"
+      "fmla z26.h, z29.h, z0.h[1]\n"
+      "fmla z11.h, z28.h, z4.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[1]\n"
+      "fmla z19.h, z28.h, z2.h[1]\n"
+      "fmla z23.h, z28.h, z1.h[1]\n"
+      "fmla z27.h, z28.h, z0.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[2]\n"
+      "fmla z12.h, z29.h, z3.h[2]\n"
+      "fmla z16.h, z29.h, z2.h[2]\n"
+      "fmla z20.h, z29.h, z1.h[2]\n"
+      "fmla z24.h, z29.h, z0.h[2]\n"
+      "fmla z9.h, z28.h, z4.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[2]\n"
+      "fmla z17.h, z28.h, z2.h[2]\n"
+      "fmla z21.h, z28.h, z1.h[2]\n"
+      "fmla z25.h, z28.h, z0.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[2]\n"
+      "fmla z14.h, z29.h, z3.h[2]\n"
+      "fmla z18.h, z29.h, z2.h[2]\n"
+      "fmla z22.h, z29.h, z1.h[2]\n"
+      "fmla z26.h, z29.h, z0.h[2]\n"
+      "fmla z11.h, z28.h, z4.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[2]\n"
+      "fmla z19.h, z28.h, z2.h[2]\n"
+      "fmla z23.h, z28.h, z1.h[2]\n"
+      "fmla z27.h, z28.h, z0.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[3]\n"
+      "fmla z12.h, z29.h, z3.h[3]\n"
+      "fmla z16.h, z29.h, z2.h[3]\n"
+      "fmla z20.h, z29.h, z1.h[3]\n"
+      "fmla z24.h, z29.h, z0.h[3]\n"
+      "fmla z9.h, z28.h, z4.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[3]\n"
+      "fmla z17.h, z28.h, z2.h[3]\n"
+      "fmla z21.h, z28.h, z1.h[3]\n"
+      "fmla z25.h, z28.h, z0.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[3]\n"
+      "fmla z14.h, z29.h, z3.h[3]\n"
+      "fmla z18.h, z29.h, z2.h[3]\n"
+      "fmla z22.h, z29.h, z1.h[3]\n"
+      "fmla z26.h, z29.h, z0.h[3]\n"
+      "fmla z11.h, z28.h, z4.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #4, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[3]\n"
+      "fmla z19.h, z28.h, z2.h[3]\n"
+      "fmla z23.h, z28.h, z1.h[3]\n"
+      "fmla z27.h, z28.h, z0.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #4, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[4]\n"
+      "fmla z12.h, z29.h, z3.h[4]\n"
+      "fmla z16.h, z29.h, z2.h[4]\n"
+      "fmla z20.h, z29.h, z1.h[4]\n"
+      "fmla z24.h, z29.h, z0.h[4]\n"
+      "fmla z9.h, z28.h, z4.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[4]\n"
+      "fmla z17.h, z28.h, z2.h[4]\n"
+      "fmla z21.h, z28.h, z1.h[4]\n"
+      "fmla z25.h, z28.h, z0.h[4]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #4, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[4]\n"
+      "fmla z14.h, z29.h, z3.h[4]\n"
+      "fmla z18.h, z29.h, z2.h[4]\n"
+      "fmla z22.h, z29.h, z1.h[4]\n"
+      "fmla z26.h, z29.h, z0.h[4]\n"
+      "fmla z11.h, z28.h, z4.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #5, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[4]\n"
+      "fmla z19.h, z28.h, z2.h[4]\n"
+      "fmla z23.h, z28.h, z1.h[4]\n"
+      "fmla z27.h, z28.h, z0.h[4]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #5, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[5]\n"
+      "fmla z12.h, z29.h, z3.h[5]\n"
+      "fmla z16.h, z29.h, z2.h[5]\n"
+      "fmla z20.h, z29.h, z1.h[5]\n"
+      "fmla z24.h, z29.h, z0.h[5]\n"
+      "fmla z9.h, z28.h, z4.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[5]\n"
+      "fmla z17.h, z28.h, z2.h[5]\n"
+      "fmla z21.h, z28.h, z1.h[5]\n"
+      "fmla z25.h, z28.h, z0.h[5]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #5, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[5]\n"
+      "fmla z14.h, z29.h, z3.h[5]\n"
+      "fmla z18.h, z29.h, z2.h[5]\n"
+      "fmla z22.h, z29.h, z1.h[5]\n"
+      "fmla z26.h, z29.h, z0.h[5]\n"
+      "fmla z11.h, z28.h, z4.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #6, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[5]\n"
+      "fmla z19.h, z28.h, z2.h[5]\n"
+      "fmla z23.h, z28.h, z1.h[5]\n"
+      "fmla z27.h, z28.h, z0.h[5]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #6, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[6]\n"
+      "fmla z12.h, z29.h, z3.h[6]\n"
+      "fmla z16.h, z29.h, z2.h[6]\n"
+      "fmla z20.h, z29.h, z1.h[6]\n"
+      "fmla z24.h, z29.h, z0.h[6]\n"
+      "fmla z9.h, z28.h, z4.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[6]\n"
+      "fmla z17.h, z28.h, z2.h[6]\n"
+      "fmla z21.h, z28.h, z1.h[6]\n"
+      "fmla z25.h, z28.h, z0.h[6]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #6, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[6]\n"
+      "fmla z14.h, z29.h, z3.h[6]\n"
+      "fmla z18.h, z29.h, z2.h[6]\n"
+      "fmla z22.h, z29.h, z1.h[6]\n"
+      "fmla z26.h, z29.h, z0.h[6]\n"
+      "fmla z11.h, z28.h, z4.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x12, #7, MUL VL]\n"
       "addvl x12, x12, #8\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[6]\n"
+      "fmla z19.h, z28.h, z2.h[6]\n"
+      "fmla z23.h, z28.h, z1.h[6]\n"
+      "fmla z27.h, z28.h, z0.h[6]\n"
+      "ld1h { z28.h }, p5/Z, [x11, #7, MUL VL]\n"
       "addvl x11, x11, #8\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[7]\n"
+      "fmla z12.h, z29.h, z3.h[7]\n"
+      "fmla z16.h, z29.h, z2.h[7]\n"
+      "fmla z20.h, z29.h, z1.h[7]\n"
+      "fmla z24.h, z29.h, z0.h[7]\n"
+      "fmla z9.h, z28.h, z4.h[7]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[7]\n"
+      "fmla z17.h, z28.h, z2.h[7]\n"
+      "fmla z21.h, z28.h, z1.h[7]\n"
+      "fmla z25.h, z28.h, z0.h[7]\n"
+      "ld1h { z28.h }, p5/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
+      "fmla z10.h, z29.h, z4.h[7]\n"
+      "fmla z14.h, z29.h, z3.h[7]\n"
+      "fmla z18.h, z29.h, z2.h[7]\n"
+      "fmla z22.h, z29.h, z1.h[7]\n"
+      "fmla z26.h, z29.h, z0.h[7]\n"
+      "fmla z11.h, z28.h, z4.h[7]\n"
+      "fmla z15.h, z28.h, z3.h[7]\n"
+      "fmla z19.h, z28.h, z2.h[7]\n"
+      "fmla z23.h, z28.h, z1.h[7]\n"
+      "fmla z27.h, z28.h, z0.h[7]\n"
       "bgt 66b\n"
       "67:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -2203,243 +2203,243 @@
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
       "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "fmla z8.h, z29.h, z0.h[0]\n"
+      "fmla z12.h, z29.h, z1.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z16.h, z29.h, z2.h[0]\n"
+      "fmla z20.h, z29.h, z3.h[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[0]\n"
+      "fmla z9.h, z28.h, z0.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
+      "fmla z13.h, z28.h, z1.h[0]\n"
+      "fmla z17.h, z28.h, z2.h[0]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[0]\n"
+      "fmla z25.h, z28.h, z4.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
+      "fmla z10.h, z29.h, z0.h[0]\n"
+      "fmla z14.h, z29.h, z1.h[0]\n"
+      "fmla z18.h, z29.h, z2.h[0]\n"
+      "fmla z22.h, z29.h, z3.h[0]\n"
+      "fmla z26.h, z29.h, z4.h[0]\n"
+      "fmla z11.h, z28.h, z0.h[0]\n"
+      "fmla z15.h, z28.h, z1.h[0]\n"
+      "fmla z19.h, z28.h, z2.h[0]\n"
+      "fmla z23.h, z28.h, z3.h[0]\n"
+      "fmla z27.h, z28.h, z4.h[0]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[1]\n"
+      "fmla z12.h, z29.h, z1.h[1]\n"
+      "fmla z16.h, z29.h, z2.h[1]\n"
+      "fmla z20.h, z29.h, z3.h[1]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[1]\n"
+      "fmla z9.h, z28.h, z0.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z13.h, z28.h, z1.h[1]\n"
+      "fmla z17.h, z28.h, z2.h[1]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[1]\n"
+      "fmla z25.h, z28.h, z4.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
+      "fmla z10.h, z29.h, z0.h[1]\n"
+      "fmla z14.h, z29.h, z1.h[1]\n"
+      "fmla z18.h, z29.h, z2.h[1]\n"
+      "fmla z22.h, z29.h, z3.h[1]\n"
+      "fmla z26.h, z29.h, z4.h[1]\n"
+      "fmla z11.h, z28.h, z0.h[1]\n"
+      "fmla z15.h, z28.h, z1.h[1]\n"
+      "fmla z19.h, z28.h, z2.h[1]\n"
+      "fmla z23.h, z28.h, z3.h[1]\n"
+      "fmla z27.h, z28.h, z4.h[1]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[2]\n"
+      "fmla z12.h, z29.h, z1.h[2]\n"
+      "fmla z16.h, z29.h, z2.h[2]\n"
+      "fmla z20.h, z29.h, z3.h[2]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[2]\n"
+      "fmla z9.h, z28.h, z0.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z13.h, z28.h, z1.h[2]\n"
+      "fmla z17.h, z28.h, z2.h[2]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[2]\n"
+      "fmla z25.h, z28.h, z4.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
+      "fmla z10.h, z29.h, z0.h[2]\n"
+      "fmla z14.h, z29.h, z1.h[2]\n"
+      "fmla z18.h, z29.h, z2.h[2]\n"
+      "fmla z22.h, z29.h, z3.h[2]\n"
+      "fmla z26.h, z29.h, z4.h[2]\n"
+      "fmla z11.h, z28.h, z0.h[2]\n"
+      "fmla z15.h, z28.h, z1.h[2]\n"
+      "fmla z19.h, z28.h, z2.h[2]\n"
+      "fmla z23.h, z28.h, z3.h[2]\n"
+      "fmla z27.h, z28.h, z4.h[2]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[3]\n"
+      "fmla z12.h, z29.h, z1.h[3]\n"
+      "fmla z16.h, z29.h, z2.h[3]\n"
+      "fmla z20.h, z29.h, z3.h[3]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[3]\n"
+      "fmla z9.h, z28.h, z0.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z13.h, z28.h, z1.h[3]\n"
+      "fmla z17.h, z28.h, z2.h[3]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[3]\n"
+      "fmla z25.h, z28.h, z4.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
+      "fmla z10.h, z29.h, z0.h[3]\n"
+      "fmla z14.h, z29.h, z1.h[3]\n"
+      "fmla z18.h, z29.h, z2.h[3]\n"
+      "fmla z22.h, z29.h, z3.h[3]\n"
+      "fmla z26.h, z29.h, z4.h[3]\n"
+      "fmla z11.h, z28.h, z0.h[3]\n"
+      "fmla z15.h, z28.h, z1.h[3]\n"
+      "fmla z19.h, z28.h, z2.h[3]\n"
+      "fmla z23.h, z28.h, z3.h[3]\n"
+      "fmla z27.h, z28.h, z4.h[3]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[4]\n"
+      "fmla z12.h, z29.h, z1.h[4]\n"
+      "fmla z16.h, z29.h, z2.h[4]\n"
+      "fmla z20.h, z29.h, z3.h[4]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[4]\n"
+      "fmla z9.h, z28.h, z0.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z13.h, z28.h, z1.h[4]\n"
+      "fmla z17.h, z28.h, z2.h[4]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[4]\n"
+      "fmla z25.h, z28.h, z4.h[4]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
+      "fmla z10.h, z29.h, z0.h[4]\n"
+      "fmla z14.h, z29.h, z1.h[4]\n"
+      "fmla z18.h, z29.h, z2.h[4]\n"
+      "fmla z22.h, z29.h, z3.h[4]\n"
+      "fmla z26.h, z29.h, z4.h[4]\n"
+      "fmla z11.h, z28.h, z0.h[4]\n"
+      "fmla z15.h, z28.h, z1.h[4]\n"
+      "fmla z19.h, z28.h, z2.h[4]\n"
+      "fmla z23.h, z28.h, z3.h[4]\n"
+      "fmla z27.h, z28.h, z4.h[4]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[5]\n"
+      "fmla z12.h, z29.h, z1.h[5]\n"
+      "fmla z16.h, z29.h, z2.h[5]\n"
+      "fmla z20.h, z29.h, z3.h[5]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[5]\n"
+      "fmla z9.h, z28.h, z0.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z13.h, z28.h, z1.h[5]\n"
+      "fmla z17.h, z28.h, z2.h[5]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[5]\n"
+      "fmla z25.h, z28.h, z4.h[5]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
+      "fmla z10.h, z29.h, z0.h[5]\n"
+      "fmla z14.h, z29.h, z1.h[5]\n"
+      "fmla z18.h, z29.h, z2.h[5]\n"
+      "fmla z22.h, z29.h, z3.h[5]\n"
+      "fmla z26.h, z29.h, z4.h[5]\n"
+      "fmla z11.h, z28.h, z0.h[5]\n"
+      "fmla z15.h, z28.h, z1.h[5]\n"
+      "fmla z19.h, z28.h, z2.h[5]\n"
+      "fmla z23.h, z28.h, z3.h[5]\n"
+      "fmla z27.h, z28.h, z4.h[5]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[6]\n"
+      "fmla z12.h, z29.h, z1.h[6]\n"
+      "fmla z16.h, z29.h, z2.h[6]\n"
+      "fmla z20.h, z29.h, z3.h[6]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[6]\n"
+      "fmla z9.h, z28.h, z0.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z13.h, z28.h, z1.h[6]\n"
+      "fmla z17.h, z28.h, z2.h[6]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z21.h, z28.h, z3.h[6]\n"
+      "fmla z25.h, z28.h, z4.h[6]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
+      "fmla z10.h, z29.h, z0.h[6]\n"
+      "fmla z14.h, z29.h, z1.h[6]\n"
+      "fmla z18.h, z29.h, z2.h[6]\n"
+      "fmla z22.h, z29.h, z3.h[6]\n"
+      "fmla z26.h, z29.h, z4.h[6]\n"
+      "fmla z11.h, z28.h, z0.h[6]\n"
+      "fmla z15.h, z28.h, z1.h[6]\n"
+      "fmla z19.h, z28.h, z2.h[6]\n"
+      "fmla z23.h, z28.h, z3.h[6]\n"
+      "fmla z27.h, z28.h, z4.h[6]\n"
       "ble 68f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
+      "ld1h { z29.h }, p5/Z, [x12]\n"
+      "ld1h { z28.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z29.h, z0.h[7]\n"
+      "fmla z12.h, z29.h, z1.h[7]\n"
+      "fmla z16.h, z29.h, z2.h[7]\n"
+      "fmla z20.h, z29.h, z3.h[7]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z29.h, z4.h[7]\n"
+      "fmla z9.h, z28.h, z0.h[7]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
       "addvl x10, x10, #1\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z13.h, z28.h, z1.h[7]\n"
+      "fmla z17.h, z28.h, z2.h[7]\n"
+      "fmla z21.h, z28.h, z3.h[7]\n"
+      "fmla z25.h, z28.h, z4.h[7]\n"
+      "ld1h { z28.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
+      "fmla z10.h, z29.h, z0.h[7]\n"
+      "fmla z14.h, z29.h, z1.h[7]\n"
+      "fmla z18.h, z29.h, z2.h[7]\n"
+      "fmla z22.h, z29.h, z3.h[7]\n"
+      "fmla z26.h, z29.h, z4.h[7]\n"
+      "fmla z11.h, z28.h, z0.h[7]\n"
+      "fmla z15.h, z28.h, z1.h[7]\n"
+      "fmla z19.h, z28.h, z2.h[7]\n"
+      "fmla z23.h, z28.h, z3.h[7]\n"
+      "fmla z27.h, z28.h, z4.h[7]\n"
       "68:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2452,49 +2452,49 @@
       "add x22, x23, x20, LSL #1\n"
       "tbz %x[flags], #1, 69f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z29.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmin z16.h, p5/M, z16.h, z1.h\n"
-      "fmin z17.h, p5/M, z17.h, z1.h\n"
-      "fmin z18.h, p5/M, z18.h, z1.h\n"
-      "fmin z19.h, p5/M, z19.h, z1.h\n"
-      "fmin z20.h, p5/M, z20.h, z1.h\n"
-      "fmin z21.h, p5/M, z21.h, z1.h\n"
-      "fmin z22.h, p5/M, z22.h, z1.h\n"
-      "fmin z23.h, p5/M, z23.h, z1.h\n"
-      "fmin z24.h, p5/M, z24.h, z1.h\n"
-      "fmin z25.h, p5/M, z25.h, z1.h\n"
-      "fmin z26.h, p5/M, z26.h, z1.h\n"
-      "fmin z27.h, p5/M, z27.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
-      "fmax z16.h, p5/M, z16.h, z0.h\n"
-      "fmax z17.h, p5/M, z17.h, z0.h\n"
-      "fmax z18.h, p5/M, z18.h, z0.h\n"
-      "fmax z19.h, p5/M, z19.h, z0.h\n"
-      "fmax z20.h, p5/M, z20.h, z0.h\n"
-      "fmax z21.h, p5/M, z21.h, z0.h\n"
-      "fmax z22.h, p5/M, z22.h, z0.h\n"
-      "fmax z23.h, p5/M, z23.h, z0.h\n"
-      "fmax z24.h, p5/M, z24.h, z0.h\n"
-      "fmax z25.h, p5/M, z25.h, z0.h\n"
-      "fmax z26.h, p5/M, z26.h, z0.h\n"
-      "fmax z27.h, p5/M, z27.h, z0.h\n"
+      "ld1rh { z28.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z29.h\n"
+      "fmin z9.h, p5/M, z9.h, z29.h\n"
+      "fmin z10.h, p5/M, z10.h, z29.h\n"
+      "fmin z11.h, p5/M, z11.h, z29.h\n"
+      "fmin z12.h, p5/M, z12.h, z29.h\n"
+      "fmin z13.h, p5/M, z13.h, z29.h\n"
+      "fmin z14.h, p5/M, z14.h, z29.h\n"
+      "fmin z15.h, p5/M, z15.h, z29.h\n"
+      "fmin z16.h, p5/M, z16.h, z29.h\n"
+      "fmin z17.h, p5/M, z17.h, z29.h\n"
+      "fmin z18.h, p5/M, z18.h, z29.h\n"
+      "fmin z19.h, p5/M, z19.h, z29.h\n"
+      "fmin z20.h, p5/M, z20.h, z29.h\n"
+      "fmin z21.h, p5/M, z21.h, z29.h\n"
+      "fmin z22.h, p5/M, z22.h, z29.h\n"
+      "fmin z23.h, p5/M, z23.h, z29.h\n"
+      "fmin z24.h, p5/M, z24.h, z29.h\n"
+      "fmin z25.h, p5/M, z25.h, z29.h\n"
+      "fmin z26.h, p5/M, z26.h, z29.h\n"
+      "fmin z27.h, p5/M, z27.h, z29.h\n"
+      "fmax z8.h, p5/M, z8.h, z28.h\n"
+      "fmax z9.h, p5/M, z9.h, z28.h\n"
+      "fmax z10.h, p5/M, z10.h, z28.h\n"
+      "fmax z11.h, p5/M, z11.h, z28.h\n"
+      "fmax z12.h, p5/M, z12.h, z28.h\n"
+      "fmax z13.h, p5/M, z13.h, z28.h\n"
+      "fmax z14.h, p5/M, z14.h, z28.h\n"
+      "fmax z15.h, p5/M, z15.h, z28.h\n"
+      "fmax z16.h, p5/M, z16.h, z28.h\n"
+      "fmax z17.h, p5/M, z17.h, z28.h\n"
+      "fmax z18.h, p5/M, z18.h, z28.h\n"
+      "fmax z19.h, p5/M, z19.h, z28.h\n"
+      "fmax z20.h, p5/M, z20.h, z28.h\n"
+      "fmax z21.h, p5/M, z21.h, z28.h\n"
+      "fmax z22.h, p5/M, z22.h, z28.h\n"
+      "fmax z23.h, p5/M, z23.h, z28.h\n"
+      "fmax z24.h, p5/M, z24.h, z28.h\n"
+      "fmax z25.h, p5/M, z25.h, z28.h\n"
+      "fmax z26.h, p5/M, z26.h, z28.h\n"
+      "fmax z27.h, p5/M, z27.h, z28.h\n"
       "69:"  // Height 5: No activation
       "st1h { z8.h }, p4, [x13]\n"
       "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
@@ -2590,35 +2590,35 @@
       "74:"  // Height 6: no bias
       "tbz %x[flags], #0, 75f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "add x24, x13, x20, LSL #1\n"
       "add x23, x24, x20, LSL #1\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
       "add x22, x23, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p4/Z, [x23]\n"
-      "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p4/Z, [x22]\n"
-      "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1h { z28.h }, p4/Z, [x21]\n"
-      "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x24]\n"
+      "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x23]\n"
+      "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x22]\n"
+      "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x21]\n"
+      "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z28.h }, p4/Z, [x20]\n"
+      "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 76f\n"
       "75:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -2650,16 +2650,16 @@
       "77:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 78f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 79f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2671,258 +2671,258 @@
       "b 79f\n"
       "78:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "79:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "ble 81f\n"
       "80:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z7.h }, p0/Z, [x26]\n"
+      "ld1rqh { z6.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z5.h }, p0/Z, [x24]\n"
+      "ld1rqh { z4.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1rqh { z5.h }, p0/Z, [x21]\n"
+      "ld1rqh { z3.h }, p0/Z, [x22]\n"
+      "ld1rqh { z2.h }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x12]\n"
+      "ld1h { z0.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z1.h, z7.h[0]\n"
+      "fmla z12.h, z1.h, z6.h[0]\n"
+      "fmla z16.h, z1.h, z5.h[0]\n"
+      "fmla z20.h, z1.h, z4.h[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z28.h, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z1.h, z3.h[0]\n"
+      "fmla z28.h, z1.h, z2.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
       "add x21, x21, #0x10\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "fmla z29.h, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z30.h, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
-      "fmla z31.h, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z28.h, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "fmla z29.h, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z30.h, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
-      "fmla z31.h, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z28.h, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "fmla z29.h, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z30.h, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
-      "fmla z31.h, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z28.h, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "fmla z29.h, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z30.h, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
-      "fmla z31.h, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z28.h, z6.h, z5.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "fmla z29.h, z7.h, z5.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z30.h, z6.h, z5.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
-      "fmla z31.h, z7.h, z5.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z28.h, z6.h, z5.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "fmla z29.h, z7.h, z5.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z30.h, z6.h, z5.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
-      "fmla z31.h, z7.h, z5.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z28.h, z6.h, z5.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "fmla z29.h, z7.h, z5.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z30.h, z6.h, z5.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[0]\n"
+      "fmla z13.h, z0.h, z6.h[0]\n"
+      "fmla z17.h, z0.h, z5.h[0]\n"
+      "fmla z21.h, z0.h, z4.h[0]\n"
+      "fmla z25.h, z0.h, z3.h[0]\n"
+      "fmla z29.h, z0.h, z2.h[0]\n"
+      "ld1h { z0.h }, p5/Z, [x9]\n"
+      "fmla z10.h, z1.h, z7.h[0]\n"
+      "fmla z14.h, z1.h, z6.h[0]\n"
+      "fmla z18.h, z1.h, z5.h[0]\n"
+      "fmla z22.h, z1.h, z4.h[0]\n"
+      "fmla z26.h, z1.h, z3.h[0]\n"
+      "fmla z30.h, z1.h, z2.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[0]\n"
+      "fmla z15.h, z0.h, z6.h[0]\n"
+      "fmla z19.h, z0.h, z5.h[0]\n"
+      "fmla z23.h, z0.h, z4.h[0]\n"
+      "fmla z27.h, z0.h, z3.h[0]\n"
+      "fmla z31.h, z0.h, z2.h[0]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[1]\n"
+      "fmla z12.h, z1.h, z6.h[1]\n"
+      "fmla z16.h, z1.h, z5.h[1]\n"
+      "fmla z20.h, z1.h, z4.h[1]\n"
+      "fmla z24.h, z1.h, z3.h[1]\n"
+      "fmla z28.h, z1.h, z2.h[1]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[1]\n"
+      "fmla z13.h, z0.h, z6.h[1]\n"
+      "fmla z17.h, z0.h, z5.h[1]\n"
+      "fmla z21.h, z0.h, z4.h[1]\n"
+      "fmla z25.h, z0.h, z3.h[1]\n"
+      "fmla z29.h, z0.h, z2.h[1]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[1]\n"
+      "fmla z14.h, z1.h, z6.h[1]\n"
+      "fmla z18.h, z1.h, z5.h[1]\n"
+      "fmla z22.h, z1.h, z4.h[1]\n"
+      "fmla z26.h, z1.h, z3.h[1]\n"
+      "fmla z30.h, z1.h, z2.h[1]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[1]\n"
+      "fmla z15.h, z0.h, z6.h[1]\n"
+      "fmla z19.h, z0.h, z5.h[1]\n"
+      "fmla z23.h, z0.h, z4.h[1]\n"
+      "fmla z27.h, z0.h, z3.h[1]\n"
+      "fmla z31.h, z0.h, z2.h[1]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[2]\n"
+      "fmla z12.h, z1.h, z6.h[2]\n"
+      "fmla z16.h, z1.h, z5.h[2]\n"
+      "fmla z20.h, z1.h, z4.h[2]\n"
+      "fmla z24.h, z1.h, z3.h[2]\n"
+      "fmla z28.h, z1.h, z2.h[2]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[2]\n"
+      "fmla z13.h, z0.h, z6.h[2]\n"
+      "fmla z17.h, z0.h, z5.h[2]\n"
+      "fmla z21.h, z0.h, z4.h[2]\n"
+      "fmla z25.h, z0.h, z3.h[2]\n"
+      "fmla z29.h, z0.h, z2.h[2]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[2]\n"
+      "fmla z14.h, z1.h, z6.h[2]\n"
+      "fmla z18.h, z1.h, z5.h[2]\n"
+      "fmla z22.h, z1.h, z4.h[2]\n"
+      "fmla z26.h, z1.h, z3.h[2]\n"
+      "fmla z30.h, z1.h, z2.h[2]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[2]\n"
+      "fmla z15.h, z0.h, z6.h[2]\n"
+      "fmla z19.h, z0.h, z5.h[2]\n"
+      "fmla z23.h, z0.h, z4.h[2]\n"
+      "fmla z27.h, z0.h, z3.h[2]\n"
+      "fmla z31.h, z0.h, z2.h[2]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[3]\n"
+      "fmla z12.h, z1.h, z6.h[3]\n"
+      "fmla z16.h, z1.h, z5.h[3]\n"
+      "fmla z20.h, z1.h, z4.h[3]\n"
+      "fmla z24.h, z1.h, z3.h[3]\n"
+      "fmla z28.h, z1.h, z2.h[3]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[3]\n"
+      "fmla z13.h, z0.h, z6.h[3]\n"
+      "fmla z17.h, z0.h, z5.h[3]\n"
+      "fmla z21.h, z0.h, z4.h[3]\n"
+      "fmla z25.h, z0.h, z3.h[3]\n"
+      "fmla z29.h, z0.h, z2.h[3]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[3]\n"
+      "fmla z14.h, z1.h, z6.h[3]\n"
+      "fmla z18.h, z1.h, z5.h[3]\n"
+      "fmla z22.h, z1.h, z4.h[3]\n"
+      "fmla z26.h, z1.h, z3.h[3]\n"
+      "fmla z30.h, z1.h, z2.h[3]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #4, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[3]\n"
+      "fmla z15.h, z0.h, z6.h[3]\n"
+      "fmla z19.h, z0.h, z5.h[3]\n"
+      "fmla z23.h, z0.h, z4.h[3]\n"
+      "fmla z27.h, z0.h, z3.h[3]\n"
+      "fmla z31.h, z0.h, z2.h[3]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #4, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[4]\n"
+      "fmla z12.h, z1.h, z6.h[4]\n"
+      "fmla z16.h, z1.h, z5.h[4]\n"
+      "fmla z20.h, z1.h, z4.h[4]\n"
+      "fmla z24.h, z1.h, z3.h[4]\n"
+      "fmla z28.h, z1.h, z2.h[4]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[4]\n"
+      "fmla z13.h, z0.h, z6.h[4]\n"
+      "fmla z17.h, z0.h, z5.h[4]\n"
+      "fmla z21.h, z0.h, z4.h[4]\n"
+      "fmla z25.h, z0.h, z3.h[4]\n"
+      "fmla z29.h, z0.h, z2.h[4]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #4, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[4]\n"
+      "fmla z14.h, z1.h, z6.h[4]\n"
+      "fmla z18.h, z1.h, z5.h[4]\n"
+      "fmla z22.h, z1.h, z4.h[4]\n"
+      "fmla z26.h, z1.h, z3.h[4]\n"
+      "fmla z30.h, z1.h, z2.h[4]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #5, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[4]\n"
+      "fmla z15.h, z0.h, z6.h[4]\n"
+      "fmla z19.h, z0.h, z5.h[4]\n"
+      "fmla z23.h, z0.h, z4.h[4]\n"
+      "fmla z27.h, z0.h, z3.h[4]\n"
+      "fmla z31.h, z0.h, z2.h[4]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #5, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[5]\n"
+      "fmla z12.h, z1.h, z6.h[5]\n"
+      "fmla z16.h, z1.h, z5.h[5]\n"
+      "fmla z20.h, z1.h, z4.h[5]\n"
+      "fmla z24.h, z1.h, z3.h[5]\n"
+      "fmla z28.h, z1.h, z2.h[5]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[5]\n"
+      "fmla z13.h, z0.h, z6.h[5]\n"
+      "fmla z17.h, z0.h, z5.h[5]\n"
+      "fmla z21.h, z0.h, z4.h[5]\n"
+      "fmla z25.h, z0.h, z3.h[5]\n"
+      "fmla z29.h, z0.h, z2.h[5]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #5, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[5]\n"
+      "fmla z14.h, z1.h, z6.h[5]\n"
+      "fmla z18.h, z1.h, z5.h[5]\n"
+      "fmla z22.h, z1.h, z4.h[5]\n"
+      "fmla z26.h, z1.h, z3.h[5]\n"
+      "fmla z30.h, z1.h, z2.h[5]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #6, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[5]\n"
+      "fmla z15.h, z0.h, z6.h[5]\n"
+      "fmla z19.h, z0.h, z5.h[5]\n"
+      "fmla z23.h, z0.h, z4.h[5]\n"
+      "fmla z27.h, z0.h, z3.h[5]\n"
+      "fmla z31.h, z0.h, z2.h[5]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #6, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[6]\n"
+      "fmla z12.h, z1.h, z6.h[6]\n"
+      "fmla z16.h, z1.h, z5.h[6]\n"
+      "fmla z20.h, z1.h, z4.h[6]\n"
+      "fmla z24.h, z1.h, z3.h[6]\n"
+      "fmla z28.h, z1.h, z2.h[6]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[6]\n"
+      "fmla z13.h, z0.h, z6.h[6]\n"
+      "fmla z17.h, z0.h, z5.h[6]\n"
+      "fmla z21.h, z0.h, z4.h[6]\n"
+      "fmla z25.h, z0.h, z3.h[6]\n"
+      "fmla z29.h, z0.h, z2.h[6]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #6, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[6]\n"
+      "fmla z14.h, z1.h, z6.h[6]\n"
+      "fmla z18.h, z1.h, z5.h[6]\n"
+      "fmla z22.h, z1.h, z4.h[6]\n"
+      "fmla z26.h, z1.h, z3.h[6]\n"
+      "fmla z30.h, z1.h, z2.h[6]\n"
+      "ld1h { z1.h }, p5/Z, [x12, #7, MUL VL]\n"
       "addvl x12, x12, #8\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
-      "fmla z31.h, z7.h, z5.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[6]\n"
+      "fmla z15.h, z0.h, z6.h[6]\n"
+      "fmla z19.h, z0.h, z5.h[6]\n"
+      "fmla z23.h, z0.h, z4.h[6]\n"
+      "fmla z27.h, z0.h, z3.h[6]\n"
+      "fmla z31.h, z0.h, z2.h[6]\n"
+      "ld1h { z0.h }, p5/Z, [x11, #7, MUL VL]\n"
       "addvl x11, x11, #8\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z28.h, z6.h, z5.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[7]\n"
+      "fmla z12.h, z1.h, z6.h[7]\n"
+      "fmla z16.h, z1.h, z5.h[7]\n"
+      "fmla z20.h, z1.h, z4.h[7]\n"
+      "fmla z24.h, z1.h, z3.h[7]\n"
+      "fmla z28.h, z1.h, z2.h[7]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "fmla z29.h, z7.h, z5.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[7]\n"
+      "fmla z13.h, z0.h, z6.h[7]\n"
+      "fmla z17.h, z0.h, z5.h[7]\n"
+      "fmla z21.h, z0.h, z4.h[7]\n"
+      "fmla z25.h, z0.h, z3.h[7]\n"
+      "fmla z29.h, z0.h, z2.h[7]\n"
+      "ld1h { z0.h }, p5/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z30.h, z6.h, z5.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
-      "fmla z31.h, z7.h, z5.h[7]\n"
+      "fmla z10.h, z1.h, z7.h[7]\n"
+      "fmla z14.h, z1.h, z6.h[7]\n"
+      "fmla z18.h, z1.h, z5.h[7]\n"
+      "fmla z22.h, z1.h, z4.h[7]\n"
+      "fmla z26.h, z1.h, z3.h[7]\n"
+      "fmla z30.h, z1.h, z2.h[7]\n"
+      "fmla z11.h, z0.h, z7.h[7]\n"
+      "fmla z15.h, z0.h, z6.h[7]\n"
+      "fmla z19.h, z0.h, z5.h[7]\n"
+      "fmla z23.h, z0.h, z4.h[7]\n"
+      "fmla z27.h, z0.h, z3.h[7]\n"
+      "fmla z31.h, z0.h, z2.h[7]\n"
       "bgt 80b\n"
       "81:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -2933,275 +2933,275 @@
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
       "ld1rqh { z4.h }, p0/Z, [x22]\n"
       "ld1rqh { z5.h }, p0/Z, [x21]\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[0]\n"
+      "fmla z12.h, z7.h, z1.h[0]\n"
+      "fmla z16.h, z7.h, z2.h[0]\n"
+      "fmla z20.h, z7.h, z3.h[0]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z28.h, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[0]\n"
+      "fmla z28.h, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x10, x10, #1\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "fmla z29.h, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z9.h, z6.h, z0.h[0]\n"
+      "fmla z13.h, z6.h, z1.h[0]\n"
+      "fmla z17.h, z6.h, z2.h[0]\n"
+      "fmla z21.h, z6.h, z3.h[0]\n"
+      "fmla z25.h, z6.h, z4.h[0]\n"
+      "fmla z29.h, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z30.h, z6.h, z5.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
-      "fmla z31.h, z7.h, z5.h[0]\n"
+      "fmla z10.h, z7.h, z0.h[0]\n"
+      "fmla z14.h, z7.h, z1.h[0]\n"
+      "fmla z18.h, z7.h, z2.h[0]\n"
+      "fmla z22.h, z7.h, z3.h[0]\n"
+      "fmla z26.h, z7.h, z4.h[0]\n"
+      "fmla z30.h, z7.h, z5.h[0]\n"
+      "fmla z11.h, z6.h, z0.h[0]\n"
+      "fmla z15.h, z6.h, z1.h[0]\n"
+      "fmla z19.h, z6.h, z2.h[0]\n"
+      "fmla z23.h, z6.h, z3.h[0]\n"
+      "fmla z27.h, z6.h, z4.h[0]\n"
+      "fmla z31.h, z6.h, z5.h[0]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[1]\n"
+      "fmla z12.h, z7.h, z1.h[1]\n"
+      "fmla z16.h, z7.h, z2.h[1]\n"
+      "fmla z20.h, z7.h, z3.h[1]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z28.h, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[1]\n"
+      "fmla z28.h, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z9.h, z6.h, z0.h[1]\n"
+      "fmla z13.h, z6.h, z1.h[1]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "fmla z29.h, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z6.h, z2.h[1]\n"
+      "fmla z21.h, z6.h, z3.h[1]\n"
+      "fmla z25.h, z6.h, z4.h[1]\n"
+      "fmla z29.h, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z30.h, z6.h, z5.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
-      "fmla z31.h, z7.h, z5.h[1]\n"
+      "fmla z10.h, z7.h, z0.h[1]\n"
+      "fmla z14.h, z7.h, z1.h[1]\n"
+      "fmla z18.h, z7.h, z2.h[1]\n"
+      "fmla z22.h, z7.h, z3.h[1]\n"
+      "fmla z26.h, z7.h, z4.h[1]\n"
+      "fmla z30.h, z7.h, z5.h[1]\n"
+      "fmla z11.h, z6.h, z0.h[1]\n"
+      "fmla z15.h, z6.h, z1.h[1]\n"
+      "fmla z19.h, z6.h, z2.h[1]\n"
+      "fmla z23.h, z6.h, z3.h[1]\n"
+      "fmla z27.h, z6.h, z4.h[1]\n"
+      "fmla z31.h, z6.h, z5.h[1]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[2]\n"
+      "fmla z12.h, z7.h, z1.h[2]\n"
+      "fmla z16.h, z7.h, z2.h[2]\n"
+      "fmla z20.h, z7.h, z3.h[2]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z28.h, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[2]\n"
+      "fmla z28.h, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z9.h, z6.h, z0.h[2]\n"
+      "fmla z13.h, z6.h, z1.h[2]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "fmla z29.h, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z6.h, z2.h[2]\n"
+      "fmla z21.h, z6.h, z3.h[2]\n"
+      "fmla z25.h, z6.h, z4.h[2]\n"
+      "fmla z29.h, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z30.h, z6.h, z5.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
-      "fmla z31.h, z7.h, z5.h[2]\n"
+      "fmla z10.h, z7.h, z0.h[2]\n"
+      "fmla z14.h, z7.h, z1.h[2]\n"
+      "fmla z18.h, z7.h, z2.h[2]\n"
+      "fmla z22.h, z7.h, z3.h[2]\n"
+      "fmla z26.h, z7.h, z4.h[2]\n"
+      "fmla z30.h, z7.h, z5.h[2]\n"
+      "fmla z11.h, z6.h, z0.h[2]\n"
+      "fmla z15.h, z6.h, z1.h[2]\n"
+      "fmla z19.h, z6.h, z2.h[2]\n"
+      "fmla z23.h, z6.h, z3.h[2]\n"
+      "fmla z27.h, z6.h, z4.h[2]\n"
+      "fmla z31.h, z6.h, z5.h[2]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[3]\n"
+      "fmla z12.h, z7.h, z1.h[3]\n"
+      "fmla z16.h, z7.h, z2.h[3]\n"
+      "fmla z20.h, z7.h, z3.h[3]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z28.h, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[3]\n"
+      "fmla z28.h, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z9.h, z6.h, z0.h[3]\n"
+      "fmla z13.h, z6.h, z1.h[3]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "fmla z29.h, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z6.h, z2.h[3]\n"
+      "fmla z21.h, z6.h, z3.h[3]\n"
+      "fmla z25.h, z6.h, z4.h[3]\n"
+      "fmla z29.h, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z30.h, z6.h, z5.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
-      "fmla z31.h, z7.h, z5.h[3]\n"
+      "fmla z10.h, z7.h, z0.h[3]\n"
+      "fmla z14.h, z7.h, z1.h[3]\n"
+      "fmla z18.h, z7.h, z2.h[3]\n"
+      "fmla z22.h, z7.h, z3.h[3]\n"
+      "fmla z26.h, z7.h, z4.h[3]\n"
+      "fmla z30.h, z7.h, z5.h[3]\n"
+      "fmla z11.h, z6.h, z0.h[3]\n"
+      "fmla z15.h, z6.h, z1.h[3]\n"
+      "fmla z19.h, z6.h, z2.h[3]\n"
+      "fmla z23.h, z6.h, z3.h[3]\n"
+      "fmla z27.h, z6.h, z4.h[3]\n"
+      "fmla z31.h, z6.h, z5.h[3]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[4]\n"
+      "fmla z12.h, z7.h, z1.h[4]\n"
+      "fmla z16.h, z7.h, z2.h[4]\n"
+      "fmla z20.h, z7.h, z3.h[4]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z28.h, z6.h, z5.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[4]\n"
+      "fmla z28.h, z7.h, z5.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z9.h, z6.h, z0.h[4]\n"
+      "fmla z13.h, z6.h, z1.h[4]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "fmla z29.h, z7.h, z5.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z6.h, z2.h[4]\n"
+      "fmla z21.h, z6.h, z3.h[4]\n"
+      "fmla z25.h, z6.h, z4.h[4]\n"
+      "fmla z29.h, z6.h, z5.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z30.h, z6.h, z5.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
-      "fmla z31.h, z7.h, z5.h[4]\n"
+      "fmla z10.h, z7.h, z0.h[4]\n"
+      "fmla z14.h, z7.h, z1.h[4]\n"
+      "fmla z18.h, z7.h, z2.h[4]\n"
+      "fmla z22.h, z7.h, z3.h[4]\n"
+      "fmla z26.h, z7.h, z4.h[4]\n"
+      "fmla z30.h, z7.h, z5.h[4]\n"
+      "fmla z11.h, z6.h, z0.h[4]\n"
+      "fmla z15.h, z6.h, z1.h[4]\n"
+      "fmla z19.h, z6.h, z2.h[4]\n"
+      "fmla z23.h, z6.h, z3.h[4]\n"
+      "fmla z27.h, z6.h, z4.h[4]\n"
+      "fmla z31.h, z6.h, z5.h[4]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[5]\n"
+      "fmla z12.h, z7.h, z1.h[5]\n"
+      "fmla z16.h, z7.h, z2.h[5]\n"
+      "fmla z20.h, z7.h, z3.h[5]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z28.h, z6.h, z5.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[5]\n"
+      "fmla z28.h, z7.h, z5.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z9.h, z6.h, z0.h[5]\n"
+      "fmla z13.h, z6.h, z1.h[5]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "fmla z29.h, z7.h, z5.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z6.h, z2.h[5]\n"
+      "fmla z21.h, z6.h, z3.h[5]\n"
+      "fmla z25.h, z6.h, z4.h[5]\n"
+      "fmla z29.h, z6.h, z5.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z30.h, z6.h, z5.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
-      "fmla z31.h, z7.h, z5.h[5]\n"
+      "fmla z10.h, z7.h, z0.h[5]\n"
+      "fmla z14.h, z7.h, z1.h[5]\n"
+      "fmla z18.h, z7.h, z2.h[5]\n"
+      "fmla z22.h, z7.h, z3.h[5]\n"
+      "fmla z26.h, z7.h, z4.h[5]\n"
+      "fmla z30.h, z7.h, z5.h[5]\n"
+      "fmla z11.h, z6.h, z0.h[5]\n"
+      "fmla z15.h, z6.h, z1.h[5]\n"
+      "fmla z19.h, z6.h, z2.h[5]\n"
+      "fmla z23.h, z6.h, z3.h[5]\n"
+      "fmla z27.h, z6.h, z4.h[5]\n"
+      "fmla z31.h, z6.h, z5.h[5]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[6]\n"
+      "fmla z12.h, z7.h, z1.h[6]\n"
+      "fmla z16.h, z7.h, z2.h[6]\n"
+      "fmla z20.h, z7.h, z3.h[6]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z28.h, z6.h, z5.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[6]\n"
+      "fmla z28.h, z7.h, z5.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z9.h, z6.h, z0.h[6]\n"
+      "fmla z13.h, z6.h, z1.h[6]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "fmla z29.h, z7.h, z5.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z17.h, z6.h, z2.h[6]\n"
+      "fmla z21.h, z6.h, z3.h[6]\n"
+      "fmla z25.h, z6.h, z4.h[6]\n"
+      "fmla z29.h, z6.h, z5.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z30.h, z6.h, z5.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
-      "fmla z31.h, z7.h, z5.h[6]\n"
+      "fmla z10.h, z7.h, z0.h[6]\n"
+      "fmla z14.h, z7.h, z1.h[6]\n"
+      "fmla z18.h, z7.h, z2.h[6]\n"
+      "fmla z22.h, z7.h, z3.h[6]\n"
+      "fmla z26.h, z7.h, z4.h[6]\n"
+      "fmla z30.h, z7.h, z5.h[6]\n"
+      "fmla z11.h, z6.h, z0.h[6]\n"
+      "fmla z15.h, z6.h, z1.h[6]\n"
+      "fmla z19.h, z6.h, z2.h[6]\n"
+      "fmla z23.h, z6.h, z3.h[6]\n"
+      "fmla z27.h, z6.h, z4.h[6]\n"
+      "fmla z31.h, z6.h, z5.h[6]\n"
       "ble 82f\n"
-      "ld1h { z6.h }, p5/Z, [x12]\n"
-      "ld1h { z7.h }, p5/Z, [x11]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x12]\n"
+      "ld1h { z6.h }, p5/Z, [x11]\n"
+      "fmla z8.h, z7.h, z0.h[7]\n"
+      "fmla z12.h, z7.h, z1.h[7]\n"
+      "fmla z16.h, z7.h, z2.h[7]\n"
+      "fmla z20.h, z7.h, z3.h[7]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z28.h, z6.h, z5.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
+      "fmla z24.h, z7.h, z4.h[7]\n"
+      "fmla z28.h, z7.h, z5.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
       "addvl x10, x10, #1\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "fmla z29.h, z7.h, z5.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x9]\n"
+      "fmla z9.h, z6.h, z0.h[7]\n"
+      "fmla z13.h, z6.h, z1.h[7]\n"
+      "fmla z17.h, z6.h, z2.h[7]\n"
+      "fmla z21.h, z6.h, z3.h[7]\n"
+      "fmla z25.h, z6.h, z4.h[7]\n"
+      "fmla z29.h, z6.h, z5.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z30.h, z6.h, z5.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
-      "fmla z31.h, z7.h, z5.h[7]\n"
+      "fmla z10.h, z7.h, z0.h[7]\n"
+      "fmla z14.h, z7.h, z1.h[7]\n"
+      "fmla z18.h, z7.h, z2.h[7]\n"
+      "fmla z22.h, z7.h, z3.h[7]\n"
+      "fmla z26.h, z7.h, z4.h[7]\n"
+      "fmla z30.h, z7.h, z5.h[7]\n"
+      "fmla z11.h, z6.h, z0.h[7]\n"
+      "fmla z15.h, z6.h, z1.h[7]\n"
+      "fmla z19.h, z6.h, z2.h[7]\n"
+      "fmla z23.h, z6.h, z3.h[7]\n"
+      "fmla z27.h, z6.h, z4.h[7]\n"
+      "fmla z31.h, z6.h, z5.h[7]\n"
       "82:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3315,4 +3315,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
index b4c124c..3a93a2f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
index 32fcac3..8e4fd43 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -163,11 +163,11 @@
       "7:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 8f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 9f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -183,12 +183,12 @@
       "10:"  // Height 1: Multiply loop: Main loop
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p4/Z, [x10]\n"
+      "ld1w { z16.s }, p4/Z, [x9]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
       "add x26, x26, #0x4\n"
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -201,12 +201,12 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p4/Z, [x10]\n"
+      "ld1w { z16.s }, p4/Z, [x9]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
@@ -214,17 +214,17 @@
       "bne 7b\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z17.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z17.s\n"
+      "fmin z9.s, p4/M, z9.s, z17.s\n"
+      "fmin z10.s, p4/M, z10.s, z17.s\n"
+      "fmin z11.s, p4/M, z11.s, z17.s\n"
+      "fmax z8.s, p4/M, z8.s, z16.s\n"
+      "fmax z9.s, p4/M, z9.s, z16.s\n"
+      "fmax z10.s, p4/M, z10.s, z16.s\n"
+      "fmax z11.s, p4/M, z11.s, z16.s\n"
       "12:"  // Height 1: No activation
       "st1w { z8.s }, p3, [x13]\n"
       "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -285,15 +285,15 @@
       "17:"  // Height 2: no bias
       "tbz %x[flags], #0, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
+      "add x20, x13, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x13]\n"
       "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x20]\n"
+      "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 19f\n"
       "18:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -309,12 +309,12 @@
       "20:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 21f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 22f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -322,7 +322,7 @@
       "b 22f\n"
       "21:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "22:"  // Height 2: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -333,19 +333,19 @@
       "23:"  // Height 2: Multiply loop: Main loop
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z12.s, p4/M, z6.s, z1.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z17.s }, p4/Z, [x10]\n"
       "addvl x12, x12, #1\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z16.s }, p4/Z, [x9]\n"
       "addvl x11, x11, #1\n"
       "add x26, x26, #0x4\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z14.s, p4/M, z17.s, z1.s\n"
       "add x25, x25, #0x4\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
+      "fmla z15.s, p4/M, z16.s, z1.s\n"
       "addvl x10, x10, #1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
@@ -357,18 +357,18 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z12.s, p4/M, z6.s, z1.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z17.s }, p4/Z, [x10]\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z16.s }, p4/Z, [x9]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z14.s, p4/M, z17.s, z1.s\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
+      "fmla z15.s, p4/M, z16.s, z1.s\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "bne 20b\n"
@@ -376,25 +376,25 @@
       "add x25, x13, x20, LSL #2\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z17.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
+      "ld1rw { z16.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z17.s\n"
+      "fmin z9.s, p4/M, z9.s, z17.s\n"
+      "fmin z10.s, p4/M, z10.s, z17.s\n"
+      "fmin z11.s, p4/M, z11.s, z17.s\n"
+      "fmin z12.s, p4/M, z12.s, z17.s\n"
+      "fmin z13.s, p4/M, z13.s, z17.s\n"
+      "fmin z14.s, p4/M, z14.s, z17.s\n"
+      "fmin z15.s, p4/M, z15.s, z17.s\n"
+      "fmax z8.s, p4/M, z8.s, z16.s\n"
+      "fmax z9.s, p4/M, z9.s, z16.s\n"
+      "fmax z10.s, p4/M, z10.s, z16.s\n"
+      "fmax z11.s, p4/M, z11.s, z16.s\n"
+      "fmax z12.s, p4/M, z12.s, z16.s\n"
+      "fmax z13.s, p4/M, z13.s, z16.s\n"
+      "fmax z14.s, p4/M, z14.s, z16.s\n"
+      "fmax z15.s, p4/M, z15.s, z16.s\n"
       "25:"  // Height 2: No activation
       "st1w { z8.s }, p3, [x13]\n"
       "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -463,20 +463,20 @@
       "30:"  // Height 3: no bias
       "tbz %x[flags], #0, 31f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x21, x13, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x13]\n"
       "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x21]\n"
+      "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x20]\n"
+      "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 32f\n"
       "31:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -496,13 +496,13 @@
       "33:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 34f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 35f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -511,8 +511,8 @@
       "b 35f\n"
       "34:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "35:"  // Height 3: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -528,22 +528,22 @@
       "addvl x11, x11, #1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z21.s }, p4/Z, [x10]\n"
       "add x26, x26, #0x4\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z20.s }, p4/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
       "add x25, x25, #0x4\n"
       "add x24, x24, #0x4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z21.s, z0.s\n"
+      "fmla z14.s, p4/M, z21.s, z1.s\n"
+      "fmla z18.s, p4/M, z21.s, z2.s\n"
+      "fmla z11.s, p4/M, z20.s, z0.s\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
+      "fmla z15.s, p4/M, z20.s, z1.s\n"
+      "fmla z19.s, p4/M, z20.s, z2.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
@@ -557,54 +557,54 @@
       "add x28, x28, #0x1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z21.s }, p4/Z, [x10]\n"
       "cmp x28, x20\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z20.s }, p4/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z21.s, z0.s\n"
+      "fmla z14.s, p4/M, z21.s, z1.s\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z18.s, p4/M, z21.s, z2.s\n"
+      "fmla z11.s, p4/M, z20.s, z0.s\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
+      "fmla z15.s, p4/M, z20.s, z1.s\n"
+      "fmla z19.s, p4/M, z20.s, z2.s\n"
       "bne 33b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x13, x20, LSL #2\n"
       "add x24, x25, x20, LSL #2\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z21.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmin z16.s, p4/M, z16.s, z1.s\n"
-      "fmin z17.s, p4/M, z17.s, z1.s\n"
-      "fmin z18.s, p4/M, z18.s, z1.s\n"
-      "fmin z19.s, p4/M, z19.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
-      "fmax z16.s, p4/M, z16.s, z0.s\n"
-      "fmax z17.s, p4/M, z17.s, z0.s\n"
-      "fmax z18.s, p4/M, z18.s, z0.s\n"
-      "fmax z19.s, p4/M, z19.s, z0.s\n"
+      "ld1rw { z20.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z21.s\n"
+      "fmin z9.s, p4/M, z9.s, z21.s\n"
+      "fmin z10.s, p4/M, z10.s, z21.s\n"
+      "fmin z11.s, p4/M, z11.s, z21.s\n"
+      "fmin z12.s, p4/M, z12.s, z21.s\n"
+      "fmin z13.s, p4/M, z13.s, z21.s\n"
+      "fmin z14.s, p4/M, z14.s, z21.s\n"
+      "fmin z15.s, p4/M, z15.s, z21.s\n"
+      "fmin z16.s, p4/M, z16.s, z21.s\n"
+      "fmin z17.s, p4/M, z17.s, z21.s\n"
+      "fmin z18.s, p4/M, z18.s, z21.s\n"
+      "fmin z19.s, p4/M, z19.s, z21.s\n"
+      "fmax z8.s, p4/M, z8.s, z20.s\n"
+      "fmax z9.s, p4/M, z9.s, z20.s\n"
+      "fmax z10.s, p4/M, z10.s, z20.s\n"
+      "fmax z11.s, p4/M, z11.s, z20.s\n"
+      "fmax z12.s, p4/M, z12.s, z20.s\n"
+      "fmax z13.s, p4/M, z13.s, z20.s\n"
+      "fmax z14.s, p4/M, z14.s, z20.s\n"
+      "fmax z15.s, p4/M, z15.s, z20.s\n"
+      "fmax z16.s, p4/M, z16.s, z20.s\n"
+      "fmax z17.s, p4/M, z17.s, z20.s\n"
+      "fmax z18.s, p4/M, z18.s, z20.s\n"
+      "fmax z19.s, p4/M, z19.s, z20.s\n"
       "38:"  // Height 3: No activation
       "st1w { z8.s }, p3, [x13]\n"
       "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -681,25 +681,25 @@
       "43:"  // Height 4: no bias
       "tbz %x[flags], #0, 44f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x22, x13, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x13]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x23]\n"
-      "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x22]\n"
+      "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x21]\n"
+      "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x20]\n"
+      "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 45f\n"
       "44:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -723,14 +723,14 @@
       "46:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 47f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 48f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -740,9 +740,9 @@
       "b 48f\n"
       "47:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "48:"  // Height 4: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -759,7 +759,7 @@
       "addvl x11, x11, #1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z20.s, p4/M, z6.s, z3.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z25.s }, p4/Z, [x10]\n"
       "add x26, x26, #0x4\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
@@ -767,22 +767,22 @@
       "add x25, x25, #0x4\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z24.s }, p4/Z, [x9]\n"
       "add x24, x24, #0x4\n"
       "add x23, x23, #0x4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z25.s, z0.s\n"
+      "fmla z14.s, p4/M, z25.s, z1.s\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
+      "fmla z18.s, p4/M, z25.s, z2.s\n"
+      "fmla z22.s, p4/M, z25.s, z3.s\n"
       "addvl x9, x9, #1\n"
       "ld1w { z6.s }, p4/Z, [x12]\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
+      "fmla z11.s, p4/M, z24.s, z0.s\n"
+      "fmla z15.s, p4/M, z24.s, z1.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
+      "fmla z19.s, p4/M, z24.s, z2.s\n"
+      "fmla z23.s, p4/M, z24.s, z3.s\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1w { z7.s }, p4/Z, [x11]\n"
@@ -794,7 +794,7 @@
       "add x28, x28, #0x1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z20.s, p4/M, z6.s, z3.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z25.s }, p4/Z, [x10]\n"
       "cmp x28, x20\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
@@ -802,17 +802,17 @@
       "addvl x11, x11, #1\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z24.s }, p4/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z25.s, z0.s\n"
+      "fmla z14.s, p4/M, z25.s, z1.s\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
+      "fmla z18.s, p4/M, z25.s, z2.s\n"
+      "fmla z22.s, p4/M, z25.s, z3.s\n"
+      "fmla z11.s, p4/M, z24.s, z0.s\n"
+      "fmla z15.s, p4/M, z24.s, z1.s\n"
+      "fmla z19.s, p4/M, z24.s, z2.s\n"
+      "fmla z23.s, p4/M, z24.s, z3.s\n"
       "bne 46b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x13, x20, LSL #2\n"
@@ -820,41 +820,41 @@
       "add x23, x24, x20, LSL #2\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z25.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmin z16.s, p4/M, z16.s, z1.s\n"
-      "fmin z17.s, p4/M, z17.s, z1.s\n"
-      "fmin z18.s, p4/M, z18.s, z1.s\n"
-      "fmin z19.s, p4/M, z19.s, z1.s\n"
-      "fmin z20.s, p4/M, z20.s, z1.s\n"
-      "fmin z21.s, p4/M, z21.s, z1.s\n"
-      "fmin z22.s, p4/M, z22.s, z1.s\n"
-      "fmin z23.s, p4/M, z23.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
-      "fmax z16.s, p4/M, z16.s, z0.s\n"
-      "fmax z17.s, p4/M, z17.s, z0.s\n"
-      "fmax z18.s, p4/M, z18.s, z0.s\n"
-      "fmax z19.s, p4/M, z19.s, z0.s\n"
-      "fmax z20.s, p4/M, z20.s, z0.s\n"
-      "fmax z21.s, p4/M, z21.s, z0.s\n"
-      "fmax z22.s, p4/M, z22.s, z0.s\n"
-      "fmax z23.s, p4/M, z23.s, z0.s\n"
+      "ld1rw { z24.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z25.s\n"
+      "fmin z9.s, p4/M, z9.s, z25.s\n"
+      "fmin z10.s, p4/M, z10.s, z25.s\n"
+      "fmin z11.s, p4/M, z11.s, z25.s\n"
+      "fmin z12.s, p4/M, z12.s, z25.s\n"
+      "fmin z13.s, p4/M, z13.s, z25.s\n"
+      "fmin z14.s, p4/M, z14.s, z25.s\n"
+      "fmin z15.s, p4/M, z15.s, z25.s\n"
+      "fmin z16.s, p4/M, z16.s, z25.s\n"
+      "fmin z17.s, p4/M, z17.s, z25.s\n"
+      "fmin z18.s, p4/M, z18.s, z25.s\n"
+      "fmin z19.s, p4/M, z19.s, z25.s\n"
+      "fmin z20.s, p4/M, z20.s, z25.s\n"
+      "fmin z21.s, p4/M, z21.s, z25.s\n"
+      "fmin z22.s, p4/M, z22.s, z25.s\n"
+      "fmin z23.s, p4/M, z23.s, z25.s\n"
+      "fmax z8.s, p4/M, z8.s, z24.s\n"
+      "fmax z9.s, p4/M, z9.s, z24.s\n"
+      "fmax z10.s, p4/M, z10.s, z24.s\n"
+      "fmax z11.s, p4/M, z11.s, z24.s\n"
+      "fmax z12.s, p4/M, z12.s, z24.s\n"
+      "fmax z13.s, p4/M, z13.s, z24.s\n"
+      "fmax z14.s, p4/M, z14.s, z24.s\n"
+      "fmax z15.s, p4/M, z15.s, z24.s\n"
+      "fmax z16.s, p4/M, z16.s, z24.s\n"
+      "fmax z17.s, p4/M, z17.s, z24.s\n"
+      "fmax z18.s, p4/M, z18.s, z24.s\n"
+      "fmax z19.s, p4/M, z19.s, z24.s\n"
+      "fmax z20.s, p4/M, z20.s, z24.s\n"
+      "fmax z21.s, p4/M, z21.s, z24.s\n"
+      "fmax z22.s, p4/M, z22.s, z24.s\n"
+      "fmax z23.s, p4/M, z23.s, z24.s\n"
       "51:"  // Height 4: No activation
       "st1w { z8.s }, p3, [x13]\n"
       "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -939,30 +939,30 @@
       "56:"  // Height 5: no bias
       "tbz %x[flags], #0, 57f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p3/Z, [x13]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x13, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p3/Z, [x13]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x23]\n"
-      "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p3/Z, [x22]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x23]\n"
+      "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x22]\n"
+      "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x21]\n"
+      "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p3/Z, [x20]\n"
+      "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 58f\n"
       "57:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -990,15 +990,15 @@
       "59:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 60f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 61f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1009,10 +1009,10 @@
       "b 61f\n"
       "60:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "61:"  // Height 5: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1034,7 +1034,7 @@
       "subs x27, x27, #0x1\n"
       "fmla z24.s, p4/M, z6.s, z4.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z29.s }, p4/Z, [x10]\n"
       "add x25, x25, #0x4\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
@@ -1042,24 +1042,24 @@
       "add x23, x23, #0x4\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
       "fmla z25.s, p4/M, z7.s, z4.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z28.s }, p4/Z, [x9]\n"
       "add x22, x22, #0x4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z29.s, z0.s\n"
+      "fmla z14.s, p4/M, z29.s, z1.s\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
-      "fmla z26.s, p4/M, z6.s, z4.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z18.s, p4/M, z29.s, z2.s\n"
+      "fmla z22.s, p4/M, z29.s, z3.s\n"
+      "fmla z26.s, p4/M, z29.s, z4.s\n"
+      "fmla z11.s, p4/M, z28.s, z0.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1w { z6.s }, p4/Z, [x12]\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
+      "fmla z15.s, p4/M, z28.s, z1.s\n"
+      "fmla z19.s, p4/M, z28.s, z2.s\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
-      "fmla z27.s, p4/M, z7.s, z4.s\n"
+      "fmla z23.s, p4/M, z28.s, z3.s\n"
+      "fmla z27.s, p4/M, z28.s, z4.s\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1rw { z4.s }, p4/Z, [x22]\n"
       "ld1w { z7.s }, p4/Z, [x11]\n"
@@ -1075,25 +1075,25 @@
       "addvl x12, x12, #1\n"
       "fmla z24.s, p4/M, z6.s, z4.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10]\n"
+      "ld1w { z29.s }, p4/Z, [x10]\n"
       "addvl x11, x11, #1\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "addvl x10, x10, #1\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
       "fmla z25.s, p4/M, z7.s, z4.s\n"
-      "ld1w { z7.s }, p4/Z, [x9]\n"
+      "ld1w { z28.s }, p4/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
-      "fmla z26.s, p4/M, z6.s, z4.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
-      "fmla z27.s, p4/M, z7.s, z4.s\n"
+      "fmla z10.s, p4/M, z29.s, z0.s\n"
+      "fmla z14.s, p4/M, z29.s, z1.s\n"
+      "fmla z18.s, p4/M, z29.s, z2.s\n"
+      "fmla z22.s, p4/M, z29.s, z3.s\n"
+      "fmla z26.s, p4/M, z29.s, z4.s\n"
+      "fmla z11.s, p4/M, z28.s, z0.s\n"
+      "fmla z15.s, p4/M, z28.s, z1.s\n"
+      "fmla z19.s, p4/M, z28.s, z2.s\n"
+      "fmla z23.s, p4/M, z28.s, z3.s\n"
+      "fmla z27.s, p4/M, z28.s, z4.s\n"
       "bne 59b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x13, x20, LSL #2\n"
@@ -1102,49 +1102,49 @@
       "add x22, x23, x20, LSL #2\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z29.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmin z16.s, p4/M, z16.s, z1.s\n"
-      "fmin z17.s, p4/M, z17.s, z1.s\n"
-      "fmin z18.s, p4/M, z18.s, z1.s\n"
-      "fmin z19.s, p4/M, z19.s, z1.s\n"
-      "fmin z20.s, p4/M, z20.s, z1.s\n"
-      "fmin z21.s, p4/M, z21.s, z1.s\n"
-      "fmin z22.s, p4/M, z22.s, z1.s\n"
-      "fmin z23.s, p4/M, z23.s, z1.s\n"
-      "fmin z24.s, p4/M, z24.s, z1.s\n"
-      "fmin z25.s, p4/M, z25.s, z1.s\n"
-      "fmin z26.s, p4/M, z26.s, z1.s\n"
-      "fmin z27.s, p4/M, z27.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
-      "fmax z16.s, p4/M, z16.s, z0.s\n"
-      "fmax z17.s, p4/M, z17.s, z0.s\n"
-      "fmax z18.s, p4/M, z18.s, z0.s\n"
-      "fmax z19.s, p4/M, z19.s, z0.s\n"
-      "fmax z20.s, p4/M, z20.s, z0.s\n"
-      "fmax z21.s, p4/M, z21.s, z0.s\n"
-      "fmax z22.s, p4/M, z22.s, z0.s\n"
-      "fmax z23.s, p4/M, z23.s, z0.s\n"
-      "fmax z24.s, p4/M, z24.s, z0.s\n"
-      "fmax z25.s, p4/M, z25.s, z0.s\n"
-      "fmax z26.s, p4/M, z26.s, z0.s\n"
-      "fmax z27.s, p4/M, z27.s, z0.s\n"
+      "ld1rw { z28.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z29.s\n"
+      "fmin z9.s, p4/M, z9.s, z29.s\n"
+      "fmin z10.s, p4/M, z10.s, z29.s\n"
+      "fmin z11.s, p4/M, z11.s, z29.s\n"
+      "fmin z12.s, p4/M, z12.s, z29.s\n"
+      "fmin z13.s, p4/M, z13.s, z29.s\n"
+      "fmin z14.s, p4/M, z14.s, z29.s\n"
+      "fmin z15.s, p4/M, z15.s, z29.s\n"
+      "fmin z16.s, p4/M, z16.s, z29.s\n"
+      "fmin z17.s, p4/M, z17.s, z29.s\n"
+      "fmin z18.s, p4/M, z18.s, z29.s\n"
+      "fmin z19.s, p4/M, z19.s, z29.s\n"
+      "fmin z20.s, p4/M, z20.s, z29.s\n"
+      "fmin z21.s, p4/M, z21.s, z29.s\n"
+      "fmin z22.s, p4/M, z22.s, z29.s\n"
+      "fmin z23.s, p4/M, z23.s, z29.s\n"
+      "fmin z24.s, p4/M, z24.s, z29.s\n"
+      "fmin z25.s, p4/M, z25.s, z29.s\n"
+      "fmin z26.s, p4/M, z26.s, z29.s\n"
+      "fmin z27.s, p4/M, z27.s, z29.s\n"
+      "fmax z8.s, p4/M, z8.s, z28.s\n"
+      "fmax z9.s, p4/M, z9.s, z28.s\n"
+      "fmax z10.s, p4/M, z10.s, z28.s\n"
+      "fmax z11.s, p4/M, z11.s, z28.s\n"
+      "fmax z12.s, p4/M, z12.s, z28.s\n"
+      "fmax z13.s, p4/M, z13.s, z28.s\n"
+      "fmax z14.s, p4/M, z14.s, z28.s\n"
+      "fmax z15.s, p4/M, z15.s, z28.s\n"
+      "fmax z16.s, p4/M, z16.s, z28.s\n"
+      "fmax z17.s, p4/M, z17.s, z28.s\n"
+      "fmax z18.s, p4/M, z18.s, z28.s\n"
+      "fmax z19.s, p4/M, z19.s, z28.s\n"
+      "fmax z20.s, p4/M, z20.s, z28.s\n"
+      "fmax z21.s, p4/M, z21.s, z28.s\n"
+      "fmax z22.s, p4/M, z22.s, z28.s\n"
+      "fmax z23.s, p4/M, z23.s, z28.s\n"
+      "fmax z24.s, p4/M, z24.s, z28.s\n"
+      "fmax z25.s, p4/M, z25.s, z28.s\n"
+      "fmax z26.s, p4/M, z26.s, z28.s\n"
+      "fmax z27.s, p4/M, z27.s, z28.s\n"
       "64:"  // Height 5: No activation
       "st1w { z8.s }, p3, [x13]\n"
       "st1w { z9.s }, p2, [x13, #1, MUL VL]\n"
@@ -1240,35 +1240,35 @@
       "69:"  // Height 6: no bias
       "tbz %x[flags], #0, 70f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p3/Z, [x13]\n"
+      "add x24, x13, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z8.s }, p3/Z, [x13]\n"
       "add x22, x23, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x23]\n"
-      "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p3/Z, [x22]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p3/Z, [x21]\n"
-      "ld1w { z29.s }, p2/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p1/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z31.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x24]\n"
+      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x23]\n"
+      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x22]\n"
+      "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z24.s }, p3/Z, [x21]\n"
+      "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p3/Z, [x20]\n"
+      "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 71f\n"
       "70:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1300,16 +1300,16 @@
       "72:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 73f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 74f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1321,11 +1321,11 @@
       "b 74f\n"
       "73:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "74:"  // Height 6: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1527,4 +1527,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
index eb057e7..b1ab31e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp
@@ -163,11 +163,11 @@
       "7:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 8f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 9f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -180,40 +180,40 @@
       "10:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
       "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x12]\n"
+      "fmla z8.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z9.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10]\n"
+      "fmla z10.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
+      "fmla z11.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z8.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z9.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z10.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z11.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z8.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z9.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z10.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z11.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z8.s, z16.s, z0.s[3]\n"
+      "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z9.s, z16.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
       "sub x27, x27, #0x4\n"
-      "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
       "cmp x27, #0x4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z10.s, z17.s, z0.s[3]\n"
+      "fmla z11.s, z16.s, z0.s[3]\n"
       "add x26, x26, #0x10\n"
       "addvl x12, x12, #4\n"
       "addvl x11, x11, #4\n"
@@ -223,56 +223,56 @@
       "11:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
       "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x12]\n"
+      "fmla z8.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z9.s, z16.s, z0.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z17.s, z0.s[0]\n"
+      "fmla z11.s, z16.s, z0.s[0]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[1]\n"
+      "fmla z9.s, z16.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z10.s, z17.s, z0.s[1]\n"
+      "fmla z11.s, z16.s, z0.s[1]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[2]\n"
+      "fmla z9.s, z16.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z10.s, z17.s, z0.s[2]\n"
+      "fmla z11.s, z16.s, z0.s[2]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 12f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[3]\n"
+      "fmla z9.s, z16.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z17.s, z0.s[3]\n"
+      "fmla z11.s, z16.s, z0.s[3]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
@@ -284,17 +284,17 @@
       "bne 7b\n"
       "tbz %x[flags], #1, 13f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "13:"  // Height 1: No activation
       "st1w { z8.s }, p4, [x13]\n"
       "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -355,15 +355,15 @@
       "18:"  // Height 2: no bias
       "tbz %x[flags], #0, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
+      "add x20, x13, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x13]\n"
       "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 20f\n"
       "19:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -379,12 +379,12 @@
       "21:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 22f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -392,143 +392,143 @@
       "b 23f\n"
       "22:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "23:"  // Height 2: input setup done
       "cmp x27, #0x4\n"
       "ble 25f\n"
       "24:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z1.s }, p0/Z, [x26]\n"
+      "ld1rqw { z0.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z1.s[0]\n"
+      "fmla z12.s, z17.s, z0.s[0]\n"
+      "fmla z9.s, z16.s, z1.s[0]\n"
+      "fmla z13.s, z16.s, z0.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z17.s, z1.s[0]\n"
+      "fmla z14.s, z17.s, z0.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x12, #1, MUL VL]\n"
       "cmp x27, #0x4\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z11.s, z16.s, z1.s[0]\n"
+      "fmla z15.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[1]\n"
+      "fmla z12.s, z17.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #1, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z9.s, z16.s, z1.s[1]\n"
+      "fmla z13.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.s, z17.s, z1.s[1]\n"
+      "fmla z14.s, z17.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z11.s, z16.s, z1.s[1]\n"
+      "fmla z15.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[2]\n"
+      "fmla z12.s, z17.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z16.s, z1.s[2]\n"
+      "fmla z13.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.s, z17.s, z1.s[2]\n"
+      "fmla z14.s, z17.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x12, #3, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z11.s, z16.s, z1.s[2]\n"
+      "fmla z15.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[3]\n"
+      "fmla z12.s, z17.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z9.s, z16.s, z1.s[3]\n"
+      "fmla z13.s, z16.s, z0.s[3]\n"
+      "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z10.s, z17.s, z1.s[3]\n"
+      "fmla z14.s, z17.s, z0.s[3]\n"
+      "fmla z11.s, z16.s, z1.s[3]\n"
+      "fmla z15.s, z16.s, z0.s[3]\n"
       "bgt 24b\n"
       "25:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
       "ld1rqw { z0.s }, p0/Z, [x26]\n"
       "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[0]\n"
+      "fmla z12.s, z17.s, z1.s[0]\n"
+      "fmla z9.s, z16.s, z0.s[0]\n"
+      "fmla z13.s, z16.s, z1.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z17.s, z0.s[0]\n"
+      "fmla z14.s, z17.s, z1.s[0]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z11.s, z16.s, z0.s[0]\n"
+      "fmla z15.s, z16.s, z1.s[0]\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[1]\n"
+      "fmla z12.s, z17.s, z1.s[1]\n"
+      "fmla z9.s, z16.s, z0.s[1]\n"
+      "fmla z13.s, z16.s, z1.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z10.s, z17.s, z0.s[1]\n"
+      "fmla z14.s, z17.s, z1.s[1]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z11.s, z16.s, z0.s[1]\n"
+      "fmla z15.s, z16.s, z1.s[1]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[2]\n"
+      "fmla z12.s, z17.s, z1.s[2]\n"
+      "fmla z9.s, z16.s, z0.s[2]\n"
+      "fmla z13.s, z16.s, z1.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z10.s, z17.s, z0.s[2]\n"
+      "fmla z14.s, z17.s, z1.s[2]\n"
       "addvl x12, x12, #1\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z11.s, z16.s, z0.s[2]\n"
+      "fmla z15.s, z16.s, z1.s[2]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "ble 26f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x12]\n"
+      "ld1w { z16.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z17.s, z0.s[3]\n"
+      "fmla z12.s, z17.s, z1.s[3]\n"
+      "fmla z9.s, z16.s, z0.s[3]\n"
+      "fmla z13.s, z16.s, z1.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z17.s, z0.s[3]\n"
+      "fmla z14.s, z17.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z11.s, z16.s, z0.s[3]\n"
+      "fmla z15.s, z16.s, z1.s[3]\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
       "26:"  // Height 2: Multiply loop: multiply skip
@@ -540,25 +540,25 @@
       "add x25, x13, x20, LSL #2\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmin z12.s, p5/M, z12.s, z17.s\n"
+      "fmin z13.s, p5/M, z13.s, z17.s\n"
+      "fmin z14.s, p5/M, z14.s, z17.s\n"
+      "fmin z15.s, p5/M, z15.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
+      "fmax z12.s, p5/M, z12.s, z16.s\n"
+      "fmax z13.s, p5/M, z13.s, z16.s\n"
+      "fmax z14.s, p5/M, z14.s, z16.s\n"
+      "fmax z15.s, p5/M, z15.s, z16.s\n"
       "27:"  // Height 2: No activation
       "st1w { z8.s }, p4, [x13]\n"
       "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -627,20 +627,20 @@
       "32:"  // Height 3: no bias
       "tbz %x[flags], #0, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x21, x13, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x13]\n"
       "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20]\n"
+      "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 34f\n"
       "33:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -660,13 +660,13 @@
       "35:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 36f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 37f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -675,89 +675,89 @@
       "b 37f\n"
       "36:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "37:"  // Height 3: input setup done
       "cmp x27, #0x4\n"
       "ble 39f\n"
       "38:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
       "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "ld1rqw { z0.s }, p0/Z, [x24]\n"
+      "ld1w { z21.s }, p5/Z, [x12]\n"
+      "fmla z8.s, z21.s, z2.s[0]\n"
+      "fmla z12.s, z21.s, z1.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x11]\n"
+      "fmla z16.s, z21.s, z0.s[0]\n"
+      "fmla z9.s, z20.s, z2.s[0]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "fmla z13.s, z20.s, z1.s[0]\n"
+      "fmla z17.s, z20.s, z0.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x9]\n"
       "cmp x27, #0x4\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z10.s, z21.s, z2.s[0]\n"
+      "fmla z14.s, z21.s, z1.s[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z18.s, z21.s, z0.s[0]\n"
+      "fmla z11.s, z20.s, z2.s[0]\n"
+      "ld1w { z21.s }, p5/Z, [x12, #1, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z15.s, z20.s, z1.s[0]\n"
+      "fmla z19.s, z20.s, z0.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.s, z21.s, z2.s[1]\n"
+      "fmla z12.s, z21.s, z1.s[1]\n"
+      "fmla z16.s, z21.s, z0.s[1]\n"
+      "fmla z9.s, z20.s, z2.s[1]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[1]\n"
+      "fmla z17.s, z20.s, z0.s[1]\n"
+      "ld1w { z20.s }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.s, z21.s, z2.s[1]\n"
+      "fmla z14.s, z21.s, z1.s[1]\n"
+      "fmla z18.s, z21.s, z0.s[1]\n"
+      "fmla z11.s, z20.s, z2.s[1]\n"
+      "ld1w { z21.s }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z15.s, z20.s, z1.s[1]\n"
+      "fmla z19.s, z20.s, z0.s[1]\n"
+      "ld1w { z20.s }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.s, z21.s, z2.s[2]\n"
+      "fmla z12.s, z21.s, z1.s[2]\n"
+      "fmla z16.s, z21.s, z0.s[2]\n"
+      "fmla z9.s, z20.s, z2.s[2]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[2]\n"
+      "fmla z17.s, z20.s, z0.s[2]\n"
+      "ld1w { z20.s }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.s, z21.s, z2.s[2]\n"
+      "fmla z14.s, z21.s, z1.s[2]\n"
+      "fmla z18.s, z21.s, z0.s[2]\n"
+      "fmla z11.s, z20.s, z2.s[2]\n"
+      "ld1w { z21.s }, p5/Z, [x12, #3, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z15.s, z20.s, z1.s[2]\n"
+      "fmla z19.s, z20.s, z0.s[2]\n"
+      "ld1w { z20.s }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z8.s, z21.s, z2.s[3]\n"
+      "fmla z12.s, z21.s, z1.s[3]\n"
+      "fmla z16.s, z21.s, z0.s[3]\n"
+      "fmla z9.s, z20.s, z2.s[3]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[3]\n"
+      "fmla z17.s, z20.s, z0.s[3]\n"
+      "ld1w { z20.s }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z10.s, z21.s, z2.s[3]\n"
+      "fmla z14.s, z21.s, z1.s[3]\n"
+      "fmla z18.s, z21.s, z0.s[3]\n"
+      "fmla z11.s, z20.s, z2.s[3]\n"
+      "fmla z15.s, z20.s, z1.s[3]\n"
+      "fmla z19.s, z20.s, z0.s[3]\n"
       "bgt 38b\n"
       "39:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -765,91 +765,91 @@
       "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "ld1w { z21.s }, p5/Z, [x12]\n"
+      "fmla z8.s, z21.s, z0.s[0]\n"
+      "fmla z12.s, z21.s, z1.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x11]\n"
+      "fmla z16.s, z21.s, z2.s[0]\n"
+      "fmla z9.s, z20.s, z0.s[0]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "fmla z13.s, z20.s, z1.s[0]\n"
+      "fmla z17.s, z20.s, z2.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z10.s, z21.s, z0.s[0]\n"
+      "fmla z14.s, z21.s, z1.s[0]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z18.s, z21.s, z2.s[0]\n"
+      "fmla z11.s, z20.s, z0.s[0]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z15.s, z20.s, z1.s[0]\n"
+      "fmla z19.s, z20.s, z2.s[0]\n"
       "ble 40f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z21.s }, p5/Z, [x12]\n"
+      "ld1w { z20.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z21.s, z0.s[1]\n"
+      "fmla z12.s, z21.s, z1.s[1]\n"
+      "fmla z16.s, z21.s, z2.s[1]\n"
+      "fmla z9.s, z20.s, z0.s[1]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z13.s, z20.s, z1.s[1]\n"
+      "fmla z17.s, z20.s, z2.s[1]\n"
+      "ld1w { z20.s }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z10.s, z21.s, z0.s[1]\n"
+      "fmla z14.s, z21.s, z1.s[1]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z18.s, z21.s, z2.s[1]\n"
+      "fmla z11.s, z20.s, z0.s[1]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z15.s, z20.s, z1.s[1]\n"
+      "fmla z19.s, z20.s, z2.s[1]\n"
       "ble 40f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z21.s }, p5/Z, [x12]\n"
+      "ld1w { z20.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z21.s, z0.s[2]\n"
+      "fmla z12.s, z21.s, z1.s[2]\n"
+      "fmla z16.s, z21.s, z2.s[2]\n"
+      "fmla z9.s, z20.s, z0.s[2]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z13.s, z20.s, z1.s[2]\n"
+      "fmla z17.s, z20.s, z2.s[2]\n"
+      "ld1w { z20.s }, p5/Z, [x9]\n"
       "addvl x12, x12, #1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z10.s, z21.s, z0.s[2]\n"
+      "fmla z14.s, z21.s, z1.s[2]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z18.s, z21.s, z2.s[2]\n"
+      "fmla z11.s, z20.s, z0.s[2]\n"
       "addvl x9, x9, #1\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z15.s, z20.s, z1.s[2]\n"
+      "fmla z19.s, z20.s, z2.s[2]\n"
       "ble 40f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z21.s }, p5/Z, [x12]\n"
+      "ld1w { z20.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z21.s, z0.s[3]\n"
+      "fmla z12.s, z21.s, z1.s[3]\n"
+      "fmla z16.s, z21.s, z2.s[3]\n"
+      "fmla z9.s, z20.s, z0.s[3]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
       "addvl x12, x12, #1\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z13.s, z20.s, z1.s[3]\n"
+      "fmla z17.s, z20.s, z2.s[3]\n"
+      "ld1w { z20.s }, p5/Z, [x9]\n"
       "addvl x11, x11, #1\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z10.s, z21.s, z0.s[3]\n"
+      "fmla z14.s, z21.s, z1.s[3]\n"
       "addvl x10, x10, #1\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z18.s, z21.s, z2.s[3]\n"
+      "fmla z11.s, z20.s, z0.s[3]\n"
+      "fmla z15.s, z20.s, z1.s[3]\n"
+      "fmla z19.s, z20.s, z2.s[3]\n"
       "40:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -860,33 +860,33 @@
       "add x24, x25, x20, LSL #2\n"
       "tbz %x[flags], #1, 41f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z21.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z20.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z21.s\n"
+      "fmin z9.s, p5/M, z9.s, z21.s\n"
+      "fmin z10.s, p5/M, z10.s, z21.s\n"
+      "fmin z11.s, p5/M, z11.s, z21.s\n"
+      "fmin z12.s, p5/M, z12.s, z21.s\n"
+      "fmin z13.s, p5/M, z13.s, z21.s\n"
+      "fmin z14.s, p5/M, z14.s, z21.s\n"
+      "fmin z15.s, p5/M, z15.s, z21.s\n"
+      "fmin z16.s, p5/M, z16.s, z21.s\n"
+      "fmin z17.s, p5/M, z17.s, z21.s\n"
+      "fmin z18.s, p5/M, z18.s, z21.s\n"
+      "fmin z19.s, p5/M, z19.s, z21.s\n"
+      "fmax z8.s, p5/M, z8.s, z20.s\n"
+      "fmax z9.s, p5/M, z9.s, z20.s\n"
+      "fmax z10.s, p5/M, z10.s, z20.s\n"
+      "fmax z11.s, p5/M, z11.s, z20.s\n"
+      "fmax z12.s, p5/M, z12.s, z20.s\n"
+      "fmax z13.s, p5/M, z13.s, z20.s\n"
+      "fmax z14.s, p5/M, z14.s, z20.s\n"
+      "fmax z15.s, p5/M, z15.s, z20.s\n"
+      "fmax z16.s, p5/M, z16.s, z20.s\n"
+      "fmax z17.s, p5/M, z17.s, z20.s\n"
+      "fmax z18.s, p5/M, z18.s, z20.s\n"
+      "fmax z19.s, p5/M, z19.s, z20.s\n"
       "41:"  // Height 3: No activation
       "st1w { z8.s }, p4, [x13]\n"
       "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -963,25 +963,25 @@
       "46:"  // Height 4: no bias
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x22, x13, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x13]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21]\n"
+      "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 48f\n"
       "47:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -1005,14 +1005,14 @@
       "49:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1022,108 +1022,108 @@
       "b 51f\n"
       "50:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "51:"  // Height 4: input setup done
       "cmp x27, #0x4\n"
       "ble 53f\n"
       "52:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z3.s }, p0/Z, [x26]\n"
+      "ld1rqw { z2.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
+      "ld1rqw { z1.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z25.s }, p5/Z, [x12]\n"
+      "ld1w { z24.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z25.s, z3.s[0]\n"
+      "fmla z12.s, z25.s, z2.s[0]\n"
+      "fmla z16.s, z25.s, z1.s[0]\n"
+      "fmla z20.s, z25.s, z0.s[0]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
+      "fmla z9.s, z24.s, z3.s[0]\n"
+      "fmla z13.s, z24.s, z2.s[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z17.s, z24.s, z1.s[0]\n"
+      "fmla z21.s, z24.s, z0.s[0]\n"
+      "ld1w { z24.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z25.s, z3.s[0]\n"
+      "fmla z14.s, z25.s, z2.s[0]\n"
+      "fmla z18.s, z25.s, z1.s[0]\n"
+      "fmla z22.s, z25.s, z0.s[0]\n"
+      "ld1w { z25.s }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z11.s, z24.s, z3.s[0]\n"
+      "fmla z15.s, z24.s, z2.s[0]\n"
+      "fmla z19.s, z24.s, z1.s[0]\n"
+      "fmla z23.s, z24.s, z0.s[0]\n"
+      "ld1w { z24.s }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[1]\n"
+      "fmla z12.s, z25.s, z2.s[1]\n"
+      "fmla z16.s, z25.s, z1.s[1]\n"
+      "fmla z20.s, z25.s, z0.s[1]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.s, z24.s, z3.s[1]\n"
+      "fmla z13.s, z24.s, z2.s[1]\n"
+      "fmla z17.s, z24.s, z1.s[1]\n"
+      "fmla z21.s, z24.s, z0.s[1]\n"
+      "ld1w { z24.s }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.s, z25.s, z3.s[1]\n"
+      "fmla z14.s, z25.s, z2.s[1]\n"
+      "fmla z18.s, z25.s, z1.s[1]\n"
+      "fmla z22.s, z25.s, z0.s[1]\n"
+      "ld1w { z25.s }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z11.s, z24.s, z3.s[1]\n"
+      "fmla z15.s, z24.s, z2.s[1]\n"
+      "fmla z19.s, z24.s, z1.s[1]\n"
+      "fmla z23.s, z24.s, z0.s[1]\n"
+      "ld1w { z24.s }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[2]\n"
+      "fmla z12.s, z25.s, z2.s[2]\n"
+      "fmla z16.s, z25.s, z1.s[2]\n"
+      "fmla z20.s, z25.s, z0.s[2]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z24.s, z3.s[2]\n"
+      "fmla z13.s, z24.s, z2.s[2]\n"
+      "fmla z17.s, z24.s, z1.s[2]\n"
+      "fmla z21.s, z24.s, z0.s[2]\n"
+      "ld1w { z24.s }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.s, z25.s, z3.s[2]\n"
+      "fmla z14.s, z25.s, z2.s[2]\n"
+      "fmla z18.s, z25.s, z1.s[2]\n"
+      "fmla z22.s, z25.s, z0.s[2]\n"
+      "ld1w { z25.s }, p5/Z, [x12, #3, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z11.s, z24.s, z3.s[2]\n"
+      "fmla z15.s, z24.s, z2.s[2]\n"
+      "fmla z19.s, z24.s, z1.s[2]\n"
+      "fmla z23.s, z24.s, z0.s[2]\n"
+      "ld1w { z24.s }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[3]\n"
+      "fmla z12.s, z25.s, z2.s[3]\n"
+      "fmla z16.s, z25.s, z1.s[3]\n"
+      "fmla z20.s, z25.s, z0.s[3]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z9.s, z24.s, z3.s[3]\n"
+      "fmla z13.s, z24.s, z2.s[3]\n"
+      "fmla z17.s, z24.s, z1.s[3]\n"
+      "fmla z21.s, z24.s, z0.s[3]\n"
+      "ld1w { z24.s }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z10.s, z25.s, z3.s[3]\n"
+      "fmla z14.s, z25.s, z2.s[3]\n"
+      "fmla z18.s, z25.s, z1.s[3]\n"
+      "fmla z22.s, z25.s, z0.s[3]\n"
+      "fmla z11.s, z24.s, z3.s[3]\n"
+      "fmla z15.s, z24.s, z2.s[3]\n"
+      "fmla z19.s, z24.s, z1.s[3]\n"
+      "fmla z23.s, z24.s, z0.s[3]\n"
       "bgt 52b\n"
       "53:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -1132,107 +1132,107 @@
       "subs x27, x27, #0x1\n"
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z25.s }, p5/Z, [x12]\n"
+      "ld1w { z24.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z25.s, z0.s[0]\n"
+      "fmla z12.s, z25.s, z1.s[0]\n"
+      "fmla z16.s, z25.s, z2.s[0]\n"
+      "fmla z20.s, z25.s, z3.s[0]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
       "addvl x12, x12, #1\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
+      "fmla z9.s, z24.s, z0.s[0]\n"
+      "fmla z13.s, z24.s, z1.s[0]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z17.s, z24.s, z2.s[0]\n"
+      "fmla z21.s, z24.s, z3.s[0]\n"
+      "ld1w { z24.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
+      "fmla z10.s, z25.s, z0.s[0]\n"
+      "fmla z14.s, z25.s, z1.s[0]\n"
+      "fmla z18.s, z25.s, z2.s[0]\n"
+      "fmla z22.s, z25.s, z3.s[0]\n"
+      "fmla z11.s, z24.s, z0.s[0]\n"
+      "fmla z15.s, z24.s, z1.s[0]\n"
+      "fmla z19.s, z24.s, z2.s[0]\n"
+      "fmla z23.s, z24.s, z3.s[0]\n"
       "ble 54f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z25.s }, p5/Z, [x12]\n"
+      "ld1w { z24.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z25.s, z0.s[1]\n"
+      "fmla z12.s, z25.s, z1.s[1]\n"
+      "fmla z16.s, z25.s, z2.s[1]\n"
+      "fmla z20.s, z25.s, z3.s[1]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z9.s, z24.s, z0.s[1]\n"
+      "fmla z13.s, z24.s, z1.s[1]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z17.s, z24.s, z2.s[1]\n"
+      "fmla z21.s, z24.s, z3.s[1]\n"
+      "ld1w { z24.s }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z10.s, z25.s, z0.s[1]\n"
+      "fmla z14.s, z25.s, z1.s[1]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
+      "fmla z18.s, z25.s, z2.s[1]\n"
+      "fmla z22.s, z25.s, z3.s[1]\n"
+      "fmla z11.s, z24.s, z0.s[1]\n"
+      "fmla z15.s, z24.s, z1.s[1]\n"
+      "fmla z19.s, z24.s, z2.s[1]\n"
+      "fmla z23.s, z24.s, z3.s[1]\n"
       "ble 54f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z25.s }, p5/Z, [x12]\n"
+      "ld1w { z24.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z25.s, z0.s[2]\n"
+      "fmla z12.s, z25.s, z1.s[2]\n"
+      "fmla z16.s, z25.s, z2.s[2]\n"
+      "fmla z20.s, z25.s, z3.s[2]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z9.s, z24.s, z0.s[2]\n"
+      "fmla z13.s, z24.s, z1.s[2]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z17.s, z24.s, z2.s[2]\n"
+      "fmla z21.s, z24.s, z3.s[2]\n"
+      "ld1w { z24.s }, p5/Z, [x9]\n"
       "addvl x10, x10, #1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z10.s, z25.s, z0.s[2]\n"
+      "fmla z14.s, z25.s, z1.s[2]\n"
       "addvl x9, x9, #1\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
+      "fmla z18.s, z25.s, z2.s[2]\n"
+      "fmla z22.s, z25.s, z3.s[2]\n"
+      "fmla z11.s, z24.s, z0.s[2]\n"
+      "fmla z15.s, z24.s, z1.s[2]\n"
+      "fmla z19.s, z24.s, z2.s[2]\n"
+      "fmla z23.s, z24.s, z3.s[2]\n"
       "ble 54f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "ld1w { z25.s }, p5/Z, [x12]\n"
+      "ld1w { z24.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z25.s, z0.s[3]\n"
+      "fmla z12.s, z25.s, z1.s[3]\n"
+      "fmla z16.s, z25.s, z2.s[3]\n"
+      "fmla z20.s, z25.s, z3.s[3]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
       "addvl x12, x12, #1\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z9.s, z24.s, z0.s[3]\n"
+      "fmla z13.s, z24.s, z1.s[3]\n"
       "addvl x11, x11, #1\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z17.s, z24.s, z2.s[3]\n"
+      "fmla z21.s, z24.s, z3.s[3]\n"
+      "ld1w { z24.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z10.s, z25.s, z0.s[3]\n"
+      "fmla z14.s, z25.s, z1.s[3]\n"
+      "fmla z18.s, z25.s, z2.s[3]\n"
+      "fmla z22.s, z25.s, z3.s[3]\n"
+      "fmla z11.s, z24.s, z0.s[3]\n"
+      "fmla z15.s, z24.s, z1.s[3]\n"
+      "fmla z19.s, z24.s, z2.s[3]\n"
+      "fmla z23.s, z24.s, z3.s[3]\n"
       "54:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1244,41 +1244,41 @@
       "add x23, x24, x20, LSL #2\n"
       "tbz %x[flags], #1, 55f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z25.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z23.s, p5/M, z23.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z23.s, p5/M, z23.s, z0.s\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z25.s\n"
+      "fmin z9.s, p5/M, z9.s, z25.s\n"
+      "fmin z10.s, p5/M, z10.s, z25.s\n"
+      "fmin z11.s, p5/M, z11.s, z25.s\n"
+      "fmin z12.s, p5/M, z12.s, z25.s\n"
+      "fmin z13.s, p5/M, z13.s, z25.s\n"
+      "fmin z14.s, p5/M, z14.s, z25.s\n"
+      "fmin z15.s, p5/M, z15.s, z25.s\n"
+      "fmin z16.s, p5/M, z16.s, z25.s\n"
+      "fmin z17.s, p5/M, z17.s, z25.s\n"
+      "fmin z18.s, p5/M, z18.s, z25.s\n"
+      "fmin z19.s, p5/M, z19.s, z25.s\n"
+      "fmin z20.s, p5/M, z20.s, z25.s\n"
+      "fmin z21.s, p5/M, z21.s, z25.s\n"
+      "fmin z22.s, p5/M, z22.s, z25.s\n"
+      "fmin z23.s, p5/M, z23.s, z25.s\n"
+      "fmax z8.s, p5/M, z8.s, z24.s\n"
+      "fmax z9.s, p5/M, z9.s, z24.s\n"
+      "fmax z10.s, p5/M, z10.s, z24.s\n"
+      "fmax z11.s, p5/M, z11.s, z24.s\n"
+      "fmax z12.s, p5/M, z12.s, z24.s\n"
+      "fmax z13.s, p5/M, z13.s, z24.s\n"
+      "fmax z14.s, p5/M, z14.s, z24.s\n"
+      "fmax z15.s, p5/M, z15.s, z24.s\n"
+      "fmax z16.s, p5/M, z16.s, z24.s\n"
+      "fmax z17.s, p5/M, z17.s, z24.s\n"
+      "fmax z18.s, p5/M, z18.s, z24.s\n"
+      "fmax z19.s, p5/M, z19.s, z24.s\n"
+      "fmax z20.s, p5/M, z20.s, z24.s\n"
+      "fmax z21.s, p5/M, z21.s, z24.s\n"
+      "fmax z22.s, p5/M, z22.s, z24.s\n"
+      "fmax z23.s, p5/M, z23.s, z24.s\n"
       "55:"  // Height 4: No activation
       "st1w { z8.s }, p4, [x13]\n"
       "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -1363,30 +1363,30 @@
       "60:"  // Height 5: no bias
       "tbz %x[flags], #0, 61f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x13]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x13, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x22]\n"
-      "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22]\n"
+      "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x20]\n"
+      "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 62f\n"
       "61:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1414,15 +1414,15 @@
       "63:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 64f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 65f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1433,127 +1433,127 @@
       "b 65f\n"
       "64:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "65:"  // Height 5: input setup done
       "cmp x27, #0x4\n"
       "ble 67f\n"
       "66:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "ld1rqw { z3.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
+      "ld1rqw { z1.s }, p0/Z, [x23]\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1rqw { z0.s }, p0/Z, [x22]\n"
+      "ld1w { z29.s }, p5/Z, [x12]\n"
+      "fmla z8.s, z29.s, z4.s[0]\n"
+      "fmla z12.s, z29.s, z3.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x11]\n"
+      "fmla z16.s, z29.s, z2.s[0]\n"
+      "fmla z20.s, z29.s, z1.s[0]\n"
       "add x25, x25, #0x10\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z29.s, z0.s[0]\n"
+      "fmla z9.s, z28.s, z4.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
       "add x24, x24, #0x10\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
+      "fmla z13.s, z28.s, z3.s[0]\n"
+      "fmla z17.s, z28.s, z2.s[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z21.s, z28.s, z1.s[0]\n"
+      "fmla z25.s, z28.s, z0.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z29.s, z4.s[0]\n"
+      "fmla z14.s, z29.s, z3.s[0]\n"
+      "fmla z18.s, z29.s, z2.s[0]\n"
+      "fmla z22.s, z29.s, z1.s[0]\n"
+      "fmla z26.s, z29.s, z0.s[0]\n"
+      "fmla z11.s, z28.s, z4.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z15.s, z28.s, z3.s[0]\n"
+      "fmla z19.s, z28.s, z2.s[0]\n"
+      "fmla z23.s, z28.s, z1.s[0]\n"
+      "fmla z27.s, z28.s, z0.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.s, z29.s, z4.s[1]\n"
+      "fmla z12.s, z29.s, z3.s[1]\n"
+      "fmla z16.s, z29.s, z2.s[1]\n"
+      "fmla z20.s, z29.s, z1.s[1]\n"
+      "fmla z24.s, z29.s, z0.s[1]\n"
+      "fmla z9.s, z28.s, z4.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z13.s, z28.s, z3.s[1]\n"
+      "fmla z17.s, z28.s, z2.s[1]\n"
+      "fmla z21.s, z28.s, z1.s[1]\n"
+      "fmla z25.s, z28.s, z0.s[1]\n"
+      "ld1w { z28.s }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.s, z29.s, z4.s[1]\n"
+      "fmla z14.s, z29.s, z3.s[1]\n"
+      "fmla z18.s, z29.s, z2.s[1]\n"
+      "fmla z22.s, z29.s, z1.s[1]\n"
+      "fmla z26.s, z29.s, z0.s[1]\n"
+      "fmla z11.s, z28.s, z4.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z15.s, z28.s, z3.s[1]\n"
+      "fmla z19.s, z28.s, z2.s[1]\n"
+      "fmla z23.s, z28.s, z1.s[1]\n"
+      "fmla z27.s, z28.s, z0.s[1]\n"
+      "ld1w { z28.s }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.s, z29.s, z4.s[2]\n"
+      "fmla z12.s, z29.s, z3.s[2]\n"
+      "fmla z16.s, z29.s, z2.s[2]\n"
+      "fmla z20.s, z29.s, z1.s[2]\n"
+      "fmla z24.s, z29.s, z0.s[2]\n"
+      "fmla z9.s, z28.s, z4.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z28.s, z3.s[2]\n"
+      "fmla z17.s, z28.s, z2.s[2]\n"
+      "fmla z21.s, z28.s, z1.s[2]\n"
+      "fmla z25.s, z28.s, z0.s[2]\n"
+      "ld1w { z28.s }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.s, z29.s, z4.s[2]\n"
+      "fmla z14.s, z29.s, z3.s[2]\n"
+      "fmla z18.s, z29.s, z2.s[2]\n"
+      "fmla z22.s, z29.s, z1.s[2]\n"
+      "fmla z26.s, z29.s, z0.s[2]\n"
+      "fmla z11.s, z28.s, z4.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x12, #3, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z15.s, z28.s, z3.s[2]\n"
+      "fmla z19.s, z28.s, z2.s[2]\n"
+      "fmla z23.s, z28.s, z1.s[2]\n"
+      "fmla z27.s, z28.s, z0.s[2]\n"
+      "ld1w { z28.s }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z8.s, z29.s, z4.s[3]\n"
+      "fmla z12.s, z29.s, z3.s[3]\n"
+      "fmla z16.s, z29.s, z2.s[3]\n"
+      "fmla z20.s, z29.s, z1.s[3]\n"
+      "fmla z24.s, z29.s, z0.s[3]\n"
+      "fmla z9.s, z28.s, z4.s[3]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z13.s, z28.s, z3.s[3]\n"
+      "fmla z17.s, z28.s, z2.s[3]\n"
+      "fmla z21.s, z28.s, z1.s[3]\n"
+      "fmla z25.s, z28.s, z0.s[3]\n"
+      "ld1w { z28.s }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
+      "fmla z10.s, z29.s, z4.s[3]\n"
+      "fmla z14.s, z29.s, z3.s[3]\n"
+      "fmla z18.s, z29.s, z2.s[3]\n"
+      "fmla z22.s, z29.s, z1.s[3]\n"
+      "fmla z26.s, z29.s, z0.s[3]\n"
+      "fmla z11.s, z28.s, z4.s[3]\n"
+      "fmla z15.s, z28.s, z3.s[3]\n"
+      "fmla z19.s, z28.s, z2.s[3]\n"
+      "fmla z23.s, z28.s, z1.s[3]\n"
+      "fmla z27.s, z28.s, z0.s[3]\n"
       "bgt 66b\n"
       "67:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -1563,123 +1563,123 @@
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
       "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x12]\n"
+      "fmla z8.s, z29.s, z0.s[0]\n"
+      "fmla z12.s, z29.s, z1.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x11]\n"
+      "fmla z16.s, z29.s, z2.s[0]\n"
+      "fmla z20.s, z29.s, z3.s[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z29.s, z4.s[0]\n"
+      "fmla z9.s, z28.s, z0.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
+      "fmla z13.s, z28.s, z1.s[0]\n"
+      "fmla z17.s, z28.s, z2.s[0]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z21.s, z28.s, z3.s[0]\n"
+      "fmla z25.s, z28.s, z4.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
+      "fmla z10.s, z29.s, z0.s[0]\n"
+      "fmla z14.s, z29.s, z1.s[0]\n"
+      "fmla z18.s, z29.s, z2.s[0]\n"
+      "fmla z22.s, z29.s, z3.s[0]\n"
+      "fmla z26.s, z29.s, z4.s[0]\n"
+      "fmla z11.s, z28.s, z0.s[0]\n"
+      "fmla z15.s, z28.s, z1.s[0]\n"
+      "fmla z19.s, z28.s, z2.s[0]\n"
+      "fmla z23.s, z28.s, z3.s[0]\n"
+      "fmla z27.s, z28.s, z4.s[0]\n"
       "ble 68f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x12]\n"
+      "ld1w { z28.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z29.s, z0.s[1]\n"
+      "fmla z12.s, z29.s, z1.s[1]\n"
+      "fmla z16.s, z29.s, z2.s[1]\n"
+      "fmla z20.s, z29.s, z3.s[1]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z29.s, z4.s[1]\n"
+      "fmla z9.s, z28.s, z0.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z13.s, z28.s, z1.s[1]\n"
+      "fmla z17.s, z28.s, z2.s[1]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z21.s, z28.s, z3.s[1]\n"
+      "fmla z25.s, z28.s, z4.s[1]\n"
+      "ld1w { z28.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
+      "fmla z10.s, z29.s, z0.s[1]\n"
+      "fmla z14.s, z29.s, z1.s[1]\n"
+      "fmla z18.s, z29.s, z2.s[1]\n"
+      "fmla z22.s, z29.s, z3.s[1]\n"
+      "fmla z26.s, z29.s, z4.s[1]\n"
+      "fmla z11.s, z28.s, z0.s[1]\n"
+      "fmla z15.s, z28.s, z1.s[1]\n"
+      "fmla z19.s, z28.s, z2.s[1]\n"
+      "fmla z23.s, z28.s, z3.s[1]\n"
+      "fmla z27.s, z28.s, z4.s[1]\n"
       "ble 68f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x12]\n"
+      "ld1w { z28.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z29.s, z0.s[2]\n"
+      "fmla z12.s, z29.s, z1.s[2]\n"
+      "fmla z16.s, z29.s, z2.s[2]\n"
+      "fmla z20.s, z29.s, z3.s[2]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z29.s, z4.s[2]\n"
+      "fmla z9.s, z28.s, z0.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z13.s, z28.s, z1.s[2]\n"
+      "fmla z17.s, z28.s, z2.s[2]\n"
       "addvl x10, x10, #1\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z21.s, z28.s, z3.s[2]\n"
+      "fmla z25.s, z28.s, z4.s[2]\n"
+      "ld1w { z28.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
+      "fmla z10.s, z29.s, z0.s[2]\n"
+      "fmla z14.s, z29.s, z1.s[2]\n"
+      "fmla z18.s, z29.s, z2.s[2]\n"
+      "fmla z22.s, z29.s, z3.s[2]\n"
+      "fmla z26.s, z29.s, z4.s[2]\n"
+      "fmla z11.s, z28.s, z0.s[2]\n"
+      "fmla z15.s, z28.s, z1.s[2]\n"
+      "fmla z19.s, z28.s, z2.s[2]\n"
+      "fmla z23.s, z28.s, z3.s[2]\n"
+      "fmla z27.s, z28.s, z4.s[2]\n"
       "ble 68f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
+      "ld1w { z29.s }, p5/Z, [x12]\n"
+      "ld1w { z28.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z29.s, z0.s[3]\n"
+      "fmla z12.s, z29.s, z1.s[3]\n"
+      "fmla z16.s, z29.s, z2.s[3]\n"
+      "fmla z20.s, z29.s, z3.s[3]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z29.s, z4.s[3]\n"
+      "fmla z9.s, z28.s, z0.s[3]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
       "addvl x10, x10, #1\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z13.s, z28.s, z1.s[3]\n"
+      "fmla z17.s, z28.s, z2.s[3]\n"
+      "fmla z21.s, z28.s, z3.s[3]\n"
+      "fmla z25.s, z28.s, z4.s[3]\n"
+      "ld1w { z28.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
+      "fmla z10.s, z29.s, z0.s[3]\n"
+      "fmla z14.s, z29.s, z1.s[3]\n"
+      "fmla z18.s, z29.s, z2.s[3]\n"
+      "fmla z22.s, z29.s, z3.s[3]\n"
+      "fmla z26.s, z29.s, z4.s[3]\n"
+      "fmla z11.s, z28.s, z0.s[3]\n"
+      "fmla z15.s, z28.s, z1.s[3]\n"
+      "fmla z19.s, z28.s, z2.s[3]\n"
+      "fmla z23.s, z28.s, z3.s[3]\n"
+      "fmla z27.s, z28.s, z4.s[3]\n"
       "68:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1692,49 +1692,49 @@
       "add x22, x23, x20, LSL #2\n"
       "tbz %x[flags], #1, 69f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z29.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z23.s, p5/M, z23.s, z1.s\n"
-      "fmin z24.s, p5/M, z24.s, z1.s\n"
-      "fmin z25.s, p5/M, z25.s, z1.s\n"
-      "fmin z26.s, p5/M, z26.s, z1.s\n"
-      "fmin z27.s, p5/M, z27.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z23.s, p5/M, z23.s, z0.s\n"
-      "fmax z24.s, p5/M, z24.s, z0.s\n"
-      "fmax z25.s, p5/M, z25.s, z0.s\n"
-      "fmax z26.s, p5/M, z26.s, z0.s\n"
-      "fmax z27.s, p5/M, z27.s, z0.s\n"
+      "ld1rw { z28.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z29.s\n"
+      "fmin z9.s, p5/M, z9.s, z29.s\n"
+      "fmin z10.s, p5/M, z10.s, z29.s\n"
+      "fmin z11.s, p5/M, z11.s, z29.s\n"
+      "fmin z12.s, p5/M, z12.s, z29.s\n"
+      "fmin z13.s, p5/M, z13.s, z29.s\n"
+      "fmin z14.s, p5/M, z14.s, z29.s\n"
+      "fmin z15.s, p5/M, z15.s, z29.s\n"
+      "fmin z16.s, p5/M, z16.s, z29.s\n"
+      "fmin z17.s, p5/M, z17.s, z29.s\n"
+      "fmin z18.s, p5/M, z18.s, z29.s\n"
+      "fmin z19.s, p5/M, z19.s, z29.s\n"
+      "fmin z20.s, p5/M, z20.s, z29.s\n"
+      "fmin z21.s, p5/M, z21.s, z29.s\n"
+      "fmin z22.s, p5/M, z22.s, z29.s\n"
+      "fmin z23.s, p5/M, z23.s, z29.s\n"
+      "fmin z24.s, p5/M, z24.s, z29.s\n"
+      "fmin z25.s, p5/M, z25.s, z29.s\n"
+      "fmin z26.s, p5/M, z26.s, z29.s\n"
+      "fmin z27.s, p5/M, z27.s, z29.s\n"
+      "fmax z8.s, p5/M, z8.s, z28.s\n"
+      "fmax z9.s, p5/M, z9.s, z28.s\n"
+      "fmax z10.s, p5/M, z10.s, z28.s\n"
+      "fmax z11.s, p5/M, z11.s, z28.s\n"
+      "fmax z12.s, p5/M, z12.s, z28.s\n"
+      "fmax z13.s, p5/M, z13.s, z28.s\n"
+      "fmax z14.s, p5/M, z14.s, z28.s\n"
+      "fmax z15.s, p5/M, z15.s, z28.s\n"
+      "fmax z16.s, p5/M, z16.s, z28.s\n"
+      "fmax z17.s, p5/M, z17.s, z28.s\n"
+      "fmax z18.s, p5/M, z18.s, z28.s\n"
+      "fmax z19.s, p5/M, z19.s, z28.s\n"
+      "fmax z20.s, p5/M, z20.s, z28.s\n"
+      "fmax z21.s, p5/M, z21.s, z28.s\n"
+      "fmax z22.s, p5/M, z22.s, z28.s\n"
+      "fmax z23.s, p5/M, z23.s, z28.s\n"
+      "fmax z24.s, p5/M, z24.s, z28.s\n"
+      "fmax z25.s, p5/M, z25.s, z28.s\n"
+      "fmax z26.s, p5/M, z26.s, z28.s\n"
+      "fmax z27.s, p5/M, z27.s, z28.s\n"
       "69:"  // Height 5: No activation
       "st1w { z8.s }, p4, [x13]\n"
       "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
@@ -1830,35 +1830,35 @@
       "74:"  // Height 6: no bias
       "tbz %x[flags], #0, 75f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x13, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "add x24, x13, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
       "add x22, x23, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x22]\n"
-      "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21]\n"
-      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x24]\n"
+      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x23]\n"
+      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x21]\n"
+      "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 76f\n"
       "75:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1890,16 +1890,16 @@
       "77:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 78f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 79f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1911,146 +1911,146 @@
       "b 79f\n"
       "78:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "79:"  // Height 6: input setup done
       "cmp x27, #0x4\n"
       "ble 81f\n"
       "80:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z7.s }, p0/Z, [x26]\n"
+      "ld1rqw { z6.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z4.s }, p0/Z, [x23]\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1rqw { z5.s }, p0/Z, [x21]\n"
+      "ld1rqw { z3.s }, p0/Z, [x22]\n"
+      "ld1rqw { z2.s }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1w { z1.s }, p5/Z, [x12]\n"
+      "ld1w { z0.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z1.s, z7.s[0]\n"
+      "fmla z12.s, z1.s, z6.s[0]\n"
+      "fmla z16.s, z1.s, z5.s[0]\n"
+      "fmla z20.s, z1.s, z4.s[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z28.s, z6.s, z5.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z1.s, z3.s[0]\n"
+      "fmla z28.s, z1.s, z2.s[0]\n"
+      "ld1w { z1.s }, p5/Z, [x10]\n"
       "add x21, x21, #0x10\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "fmla z29.s, z7.s, z5.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z30.s, z6.s, z5.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
-      "fmla z31.s, z7.s, z5.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z28.s, z6.s, z5.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "fmla z29.s, z7.s, z5.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z30.s, z6.s, z5.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
-      "fmla z31.s, z7.s, z5.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z28.s, z6.s, z5.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "fmla z29.s, z7.s, z5.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z30.s, z6.s, z5.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[0]\n"
+      "fmla z13.s, z0.s, z6.s[0]\n"
+      "fmla z17.s, z0.s, z5.s[0]\n"
+      "fmla z21.s, z0.s, z4.s[0]\n"
+      "fmla z25.s, z0.s, z3.s[0]\n"
+      "fmla z29.s, z0.s, z2.s[0]\n"
+      "ld1w { z0.s }, p5/Z, [x9]\n"
+      "fmla z10.s, z1.s, z7.s[0]\n"
+      "fmla z14.s, z1.s, z6.s[0]\n"
+      "fmla z18.s, z1.s, z5.s[0]\n"
+      "fmla z22.s, z1.s, z4.s[0]\n"
+      "fmla z26.s, z1.s, z3.s[0]\n"
+      "fmla z30.s, z1.s, z2.s[0]\n"
+      "ld1w { z1.s }, p5/Z, [x12, #1, MUL VL]\n"
+      "fmla z11.s, z0.s, z7.s[0]\n"
+      "fmla z15.s, z0.s, z6.s[0]\n"
+      "fmla z19.s, z0.s, z5.s[0]\n"
+      "fmla z23.s, z0.s, z4.s[0]\n"
+      "fmla z27.s, z0.s, z3.s[0]\n"
+      "fmla z31.s, z0.s, z2.s[0]\n"
+      "ld1w { z0.s }, p5/Z, [x11, #1, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[1]\n"
+      "fmla z12.s, z1.s, z6.s[1]\n"
+      "fmla z16.s, z1.s, z5.s[1]\n"
+      "fmla z20.s, z1.s, z4.s[1]\n"
+      "fmla z24.s, z1.s, z3.s[1]\n"
+      "fmla z28.s, z1.s, z2.s[1]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[1]\n"
+      "fmla z13.s, z0.s, z6.s[1]\n"
+      "fmla z17.s, z0.s, z5.s[1]\n"
+      "fmla z21.s, z0.s, z4.s[1]\n"
+      "fmla z25.s, z0.s, z3.s[1]\n"
+      "fmla z29.s, z0.s, z2.s[1]\n"
+      "ld1w { z0.s }, p5/Z, [x9, #1, MUL VL]\n"
+      "fmla z10.s, z1.s, z7.s[1]\n"
+      "fmla z14.s, z1.s, z6.s[1]\n"
+      "fmla z18.s, z1.s, z5.s[1]\n"
+      "fmla z22.s, z1.s, z4.s[1]\n"
+      "fmla z26.s, z1.s, z3.s[1]\n"
+      "fmla z30.s, z1.s, z2.s[1]\n"
+      "ld1w { z1.s }, p5/Z, [x12, #2, MUL VL]\n"
+      "fmla z11.s, z0.s, z7.s[1]\n"
+      "fmla z15.s, z0.s, z6.s[1]\n"
+      "fmla z19.s, z0.s, z5.s[1]\n"
+      "fmla z23.s, z0.s, z4.s[1]\n"
+      "fmla z27.s, z0.s, z3.s[1]\n"
+      "fmla z31.s, z0.s, z2.s[1]\n"
+      "ld1w { z0.s }, p5/Z, [x11, #2, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[2]\n"
+      "fmla z12.s, z1.s, z6.s[2]\n"
+      "fmla z16.s, z1.s, z5.s[2]\n"
+      "fmla z20.s, z1.s, z4.s[2]\n"
+      "fmla z24.s, z1.s, z3.s[2]\n"
+      "fmla z28.s, z1.s, z2.s[2]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[2]\n"
+      "fmla z13.s, z0.s, z6.s[2]\n"
+      "fmla z17.s, z0.s, z5.s[2]\n"
+      "fmla z21.s, z0.s, z4.s[2]\n"
+      "fmla z25.s, z0.s, z3.s[2]\n"
+      "fmla z29.s, z0.s, z2.s[2]\n"
+      "ld1w { z0.s }, p5/Z, [x9, #2, MUL VL]\n"
+      "fmla z10.s, z1.s, z7.s[2]\n"
+      "fmla z14.s, z1.s, z6.s[2]\n"
+      "fmla z18.s, z1.s, z5.s[2]\n"
+      "fmla z22.s, z1.s, z4.s[2]\n"
+      "fmla z26.s, z1.s, z3.s[2]\n"
+      "fmla z30.s, z1.s, z2.s[2]\n"
+      "ld1w { z1.s }, p5/Z, [x12, #3, MUL VL]\n"
       "addvl x12, x12, #4\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
-      "fmla z31.s, z7.s, z5.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n"
+      "fmla z11.s, z0.s, z7.s[2]\n"
+      "fmla z15.s, z0.s, z6.s[2]\n"
+      "fmla z19.s, z0.s, z5.s[2]\n"
+      "fmla z23.s, z0.s, z4.s[2]\n"
+      "fmla z27.s, z0.s, z3.s[2]\n"
+      "fmla z31.s, z0.s, z2.s[2]\n"
+      "ld1w { z0.s }, p5/Z, [x11, #3, MUL VL]\n"
       "addvl x11, x11, #4\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z28.s, z6.s, z5.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[3]\n"
+      "fmla z12.s, z1.s, z6.s[3]\n"
+      "fmla z16.s, z1.s, z5.s[3]\n"
+      "fmla z20.s, z1.s, z4.s[3]\n"
+      "fmla z24.s, z1.s, z3.s[3]\n"
+      "fmla z28.s, z1.s, z2.s[3]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "fmla z29.s, z7.s, z5.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[3]\n"
+      "fmla z13.s, z0.s, z6.s[3]\n"
+      "fmla z17.s, z0.s, z5.s[3]\n"
+      "fmla z21.s, z0.s, z4.s[3]\n"
+      "fmla z25.s, z0.s, z3.s[3]\n"
+      "fmla z29.s, z0.s, z2.s[3]\n"
+      "ld1w { z0.s }, p5/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z30.s, z6.s, z5.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
-      "fmla z31.s, z7.s, z5.s[3]\n"
+      "fmla z10.s, z1.s, z7.s[3]\n"
+      "fmla z14.s, z1.s, z6.s[3]\n"
+      "fmla z18.s, z1.s, z5.s[3]\n"
+      "fmla z22.s, z1.s, z4.s[3]\n"
+      "fmla z26.s, z1.s, z3.s[3]\n"
+      "fmla z30.s, z1.s, z2.s[3]\n"
+      "fmla z11.s, z0.s, z7.s[3]\n"
+      "fmla z15.s, z0.s, z6.s[3]\n"
+      "fmla z19.s, z0.s, z5.s[3]\n"
+      "fmla z23.s, z0.s, z4.s[3]\n"
+      "fmla z27.s, z0.s, z3.s[3]\n"
+      "fmla z31.s, z0.s, z2.s[3]\n"
       "bgt 80b\n"
       "81:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -2061,139 +2061,139 @@
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
       "ld1rqw { z4.s }, p0/Z, [x22]\n"
       "ld1rqw { z5.s }, p0/Z, [x21]\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x12]\n"
+      "ld1w { z6.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z7.s, z0.s[0]\n"
+      "fmla z12.s, z7.s, z1.s[0]\n"
+      "fmla z16.s, z7.s, z2.s[0]\n"
+      "fmla z20.s, z7.s, z3.s[0]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z28.s, z6.s, z5.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z7.s, z4.s[0]\n"
+      "fmla z28.s, z7.s, z5.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
       "addvl x10, x10, #1\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "fmla z29.s, z7.s, z5.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z9.s, z6.s, z0.s[0]\n"
+      "fmla z13.s, z6.s, z1.s[0]\n"
+      "fmla z17.s, z6.s, z2.s[0]\n"
+      "fmla z21.s, z6.s, z3.s[0]\n"
+      "fmla z25.s, z6.s, z4.s[0]\n"
+      "fmla z29.s, z6.s, z5.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z30.s, z6.s, z5.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
-      "fmla z31.s, z7.s, z5.s[0]\n"
+      "fmla z10.s, z7.s, z0.s[0]\n"
+      "fmla z14.s, z7.s, z1.s[0]\n"
+      "fmla z18.s, z7.s, z2.s[0]\n"
+      "fmla z22.s, z7.s, z3.s[0]\n"
+      "fmla z26.s, z7.s, z4.s[0]\n"
+      "fmla z30.s, z7.s, z5.s[0]\n"
+      "fmla z11.s, z6.s, z0.s[0]\n"
+      "fmla z15.s, z6.s, z1.s[0]\n"
+      "fmla z19.s, z6.s, z2.s[0]\n"
+      "fmla z23.s, z6.s, z3.s[0]\n"
+      "fmla z27.s, z6.s, z4.s[0]\n"
+      "fmla z31.s, z6.s, z5.s[0]\n"
       "ble 82f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x12]\n"
+      "ld1w { z6.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z7.s, z0.s[1]\n"
+      "fmla z12.s, z7.s, z1.s[1]\n"
+      "fmla z16.s, z7.s, z2.s[1]\n"
+      "fmla z20.s, z7.s, z3.s[1]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z28.s, z6.s, z5.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z7.s, z4.s[1]\n"
+      "fmla z28.s, z7.s, z5.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z9.s, z6.s, z0.s[1]\n"
+      "fmla z13.s, z6.s, z1.s[1]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "fmla z29.s, z7.s, z5.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z17.s, z6.s, z2.s[1]\n"
+      "fmla z21.s, z6.s, z3.s[1]\n"
+      "fmla z25.s, z6.s, z4.s[1]\n"
+      "fmla z29.s, z6.s, z5.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z30.s, z6.s, z5.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
-      "fmla z31.s, z7.s, z5.s[1]\n"
+      "fmla z10.s, z7.s, z0.s[1]\n"
+      "fmla z14.s, z7.s, z1.s[1]\n"
+      "fmla z18.s, z7.s, z2.s[1]\n"
+      "fmla z22.s, z7.s, z3.s[1]\n"
+      "fmla z26.s, z7.s, z4.s[1]\n"
+      "fmla z30.s, z7.s, z5.s[1]\n"
+      "fmla z11.s, z6.s, z0.s[1]\n"
+      "fmla z15.s, z6.s, z1.s[1]\n"
+      "fmla z19.s, z6.s, z2.s[1]\n"
+      "fmla z23.s, z6.s, z3.s[1]\n"
+      "fmla z27.s, z6.s, z4.s[1]\n"
+      "fmla z31.s, z6.s, z5.s[1]\n"
       "ble 82f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x12]\n"
+      "ld1w { z6.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z7.s, z0.s[2]\n"
+      "fmla z12.s, z7.s, z1.s[2]\n"
+      "fmla z16.s, z7.s, z2.s[2]\n"
+      "fmla z20.s, z7.s, z3.s[2]\n"
       "subs x27, x27, #0x1\n"
       "addvl x12, x12, #1\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z28.s, z6.s, z5.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z7.s, z4.s[2]\n"
+      "fmla z28.s, z7.s, z5.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
       "addvl x11, x11, #1\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z9.s, z6.s, z0.s[2]\n"
+      "fmla z13.s, z6.s, z1.s[2]\n"
       "addvl x10, x10, #1\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "fmla z29.s, z7.s, z5.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z17.s, z6.s, z2.s[2]\n"
+      "fmla z21.s, z6.s, z3.s[2]\n"
+      "fmla z25.s, z6.s, z4.s[2]\n"
+      "fmla z29.s, z6.s, z5.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z30.s, z6.s, z5.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
-      "fmla z31.s, z7.s, z5.s[2]\n"
+      "fmla z10.s, z7.s, z0.s[2]\n"
+      "fmla z14.s, z7.s, z1.s[2]\n"
+      "fmla z18.s, z7.s, z2.s[2]\n"
+      "fmla z22.s, z7.s, z3.s[2]\n"
+      "fmla z26.s, z7.s, z4.s[2]\n"
+      "fmla z30.s, z7.s, z5.s[2]\n"
+      "fmla z11.s, z6.s, z0.s[2]\n"
+      "fmla z15.s, z6.s, z1.s[2]\n"
+      "fmla z19.s, z6.s, z2.s[2]\n"
+      "fmla z23.s, z6.s, z3.s[2]\n"
+      "fmla z27.s, z6.s, z4.s[2]\n"
+      "fmla z31.s, z6.s, z5.s[2]\n"
       "ble 82f\n"
-      "ld1w { z6.s }, p5/Z, [x12]\n"
-      "ld1w { z7.s }, p5/Z, [x11]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x12]\n"
+      "ld1w { z6.s }, p5/Z, [x11]\n"
+      "fmla z8.s, z7.s, z0.s[3]\n"
+      "fmla z12.s, z7.s, z1.s[3]\n"
+      "fmla z16.s, z7.s, z2.s[3]\n"
+      "fmla z20.s, z7.s, z3.s[3]\n"
       "addvl x12, x12, #1\n"
       "addvl x11, x11, #1\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z28.s, z6.s, z5.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
+      "fmla z24.s, z7.s, z4.s[3]\n"
+      "fmla z28.s, z7.s, z5.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
       "addvl x10, x10, #1\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "fmla z29.s, z7.s, z5.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x9]\n"
+      "fmla z9.s, z6.s, z0.s[3]\n"
+      "fmla z13.s, z6.s, z1.s[3]\n"
+      "fmla z17.s, z6.s, z2.s[3]\n"
+      "fmla z21.s, z6.s, z3.s[3]\n"
+      "fmla z25.s, z6.s, z4.s[3]\n"
+      "fmla z29.s, z6.s, z5.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x9]\n"
       "addvl x9, x9, #1\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z30.s, z6.s, z5.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
-      "fmla z31.s, z7.s, z5.s[3]\n"
+      "fmla z10.s, z7.s, z0.s[3]\n"
+      "fmla z14.s, z7.s, z1.s[3]\n"
+      "fmla z18.s, z7.s, z2.s[3]\n"
+      "fmla z22.s, z7.s, z3.s[3]\n"
+      "fmla z26.s, z7.s, z4.s[3]\n"
+      "fmla z30.s, z7.s, z5.s[3]\n"
+      "fmla z11.s, z6.s, z0.s[3]\n"
+      "fmla z15.s, z6.s, z1.s[3]\n"
+      "fmla z19.s, z6.s, z2.s[3]\n"
+      "fmla z23.s, z6.s, z3.s[3]\n"
+      "fmla z27.s, z6.s, z4.s[3]\n"
+      "fmla z31.s, z6.s, z5.s[3]\n"
       "82:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2307,4 +2307,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 3ee3e31..887d78e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
index 36fc9d7..57f42cc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -174,22 +174,22 @@
       "b 6f\n"
       "4:"  // Height 1: no bias
       "tbz %x[flags], #0, 5f\n"
-      "ld1w { z9.s }, p6/Z, [x13]\n"
-      "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+      "ld1w { z21.s }, p6/Z, [x13]\n"
+      "ld1w { z20.s }, p5/Z, [x13, #1, MUL VL]\n"
+      "zip1 z8.d, z21.d, z14.d\n"
+      "zip2 z14.d, z21.d, z14.d\n"
+      "ld1w { z23.s }, p4/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+      "zip1 z9.d, z20.d, z15.d\n"
+      "zip2 z15.d, z20.d, z15.d\n"
+      "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
+      "zip1 z10.d, z23.d, z16.d\n"
+      "zip2 z16.d, z23.d, z16.d\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "zip1 z12.d, z21.d, z18.d\n"
+      "zip2 z18.d, z21.d, z18.d\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
       "b 6f\n"
@@ -211,11 +211,11 @@
       "7:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 8f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 9f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -227,35 +227,35 @@
       "ble 11f\n"
       "10:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "ld1rqw { z24.s }, p0/Z, [x24]\n"
+      ".inst 0x658abf18  // bfcvt z24.h, p7/M, z24.s\n"
+      "uzp1 z24.h, z24.h, z24.h\n"
+      "ld1h { z21.h }, p7/Z, [x12]\n"
+      "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6475e708  // bfmmla z8.s, z24.h, z21.h\n"
+      ".inst 0x6474e70e  // bfmmla z14.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x11]\n"
+      "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6475e709  // bfmmla z9.s, z24.h, z21.h\n"
+      ".inst 0x6474e70f  // bfmmla z15.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x10]\n"
+      "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6475e70a  // bfmmla z10.s, z24.h, z21.h\n"
+      ".inst 0x6474e710  // bfmmla z16.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x9]\n"
+      "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6475e70b  // bfmmla z11.s, z24.h, z21.h\n"
+      ".inst 0x6474e711  // bfmmla z17.s, z24.h, z20.h\n"
+      "ld1h { z23.h }, p7/Z, [x28]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      "ld1h { z21.h }, p7/Z, [x27]\n"
+      "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6477e70c  // bfmmla z12.s, z24.h, z23.h\n"
+      ".inst 0x6476e712  // bfmmla z18.s, z24.h, z22.h\n"
+      ".inst 0x6475e70d  // bfmmla z13.s, z24.h, z21.h\n"
+      ".inst 0x6474e713  // bfmmla z19.s, z24.h, z20.h\n"
       "add x24, x24, #0x10\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
@@ -266,33 +266,33 @@
       "bgt 10b\n"
       "11:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      "ld1rqw { z22.s }, p0/Z, [x24]\n"
+      ".inst 0x658abed6  // bfcvt z22.h, p7/M, z22.s\n"
+      "uzp1 z22.h, z22.h, z22.h\n"
+      "ld1h { z21.h }, p7/Z, [x12]\n"
+      "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n"
+      ".inst 0x6475e6c8  // bfmmla z8.s, z22.h, z21.h\n"
+      ".inst 0x6474e6ce  // bfmmla z14.s, z22.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x11]\n"
+      "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6475e6c9  // bfmmla z9.s, z22.h, z21.h\n"
+      ".inst 0x6474e6cf  // bfmmla z15.s, z22.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x10]\n"
+      "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6475e6ca  // bfmmla z10.s, z22.h, z21.h\n"
+      ".inst 0x6474e6d0  // bfmmla z16.s, z22.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x9]\n"
+      "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6475e6cb  // bfmmla z11.s, z22.h, z21.h\n"
+      ".inst 0x6474e6d1  // bfmmla z17.s, z22.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6475e6cc  // bfmmla z12.s, z22.h, z21.h\n"
+      ".inst 0x6474e6d2  // bfmmla z18.s, z22.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x27]\n"
+      "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6475e6cd  // bfmmla z13.s, z22.h, z21.h\n"
+      ".inst 0x6474e6d3  // bfmmla z19.s, z22.h, z20.h\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
       "addvl x10, x10, #2\n"
@@ -312,21 +312,21 @@
       "uzp1 z13.d, z13.d, z19.d\n"
       "tbz %x[flags], #1, 13f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p7/Z, [x20]\n"
+      "ld1rw { z21.s }, p7/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p7/Z, [x20]\n"
-      "fmin z8.s, p7/M, z8.s, z1.s\n"
-      "fmin z9.s, p7/M, z9.s, z1.s\n"
-      "fmin z10.s, p7/M, z10.s, z1.s\n"
-      "fmin z11.s, p7/M, z11.s, z1.s\n"
-      "fmin z12.s, p7/M, z12.s, z1.s\n"
-      "fmin z13.s, p7/M, z13.s, z1.s\n"
-      "fmax z8.s, p7/M, z8.s, z0.s\n"
-      "fmax z9.s, p7/M, z9.s, z0.s\n"
-      "fmax z10.s, p7/M, z10.s, z0.s\n"
-      "fmax z11.s, p7/M, z11.s, z0.s\n"
-      "fmax z12.s, p7/M, z12.s, z0.s\n"
-      "fmax z13.s, p7/M, z13.s, z0.s\n"
+      "ld1rw { z20.s }, p7/Z, [x20]\n"
+      "fmin z8.s, p7/M, z8.s, z21.s\n"
+      "fmin z9.s, p7/M, z9.s, z21.s\n"
+      "fmin z10.s, p7/M, z10.s, z21.s\n"
+      "fmin z11.s, p7/M, z11.s, z21.s\n"
+      "fmin z12.s, p7/M, z12.s, z21.s\n"
+      "fmin z13.s, p7/M, z13.s, z21.s\n"
+      "fmax z8.s, p7/M, z8.s, z20.s\n"
+      "fmax z9.s, p7/M, z9.s, z20.s\n"
+      "fmax z10.s, p7/M, z10.s, z20.s\n"
+      "fmax z11.s, p7/M, z11.s, z20.s\n"
+      "fmax z12.s, p7/M, z12.s, z20.s\n"
+      "fmax z13.s, p7/M, z13.s, z20.s\n"
       "13:"  // Height 1: No activation
       "st1w { z8.s }, p6, [x13]\n"
       "st1w { z9.s }, p5, [x13, #1, MUL VL]\n"
@@ -413,29 +413,29 @@
       "18:"  // Height 2: no bias
       "tbz %x[flags], #0, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x23, x13, x20, LSL #2\n"
-      "ld1w { z9.s }, p6/Z, [x13]\n"
-      "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+      "add x20, x13, x20, LSL #2\n"
+      "ld1w { z16.s }, p6/Z, [x13]\n"
+      "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
-      "ld1w { z14.s }, p6/Z, [x23]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
+      "ld1w { z14.s }, p6/Z, [x20]\n"
+      "zip1 z8.d, z16.d, z14.d\n"
+      "zip2 z14.d, z16.d, z14.d\n"
+      "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z15.d\n"
+      "zip2 z15.d, z17.d, z15.d\n"
+      "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+      "zip1 z10.d, z19.d, z16.d\n"
+      "zip2 z16.d, z19.d, z16.d\n"
+      "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "zip1 z12.d, z21.d, z18.d\n"
+      "zip2 z18.d, z21.d, z18.d\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
       "b 20f\n"
@@ -457,12 +457,12 @@
       "21:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 22f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -470,45 +470,45 @@
       "b 23f\n"
       "22:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "23:"  // Height 2: input setup done
       "cmp x25, #0x4\n"
       "ble 25f\n"
       "24:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
+      "ld1rqw { z24.s }, p0/Z, [x24]\n"
+      "ld1rqw { z20.s }, p0/Z, [x23]\n"
+      ".inst 0x658abf18  // bfcvt z24.h, p7/M, z24.s\n"
+      ".inst 0x658abe94  // bfcvt z20.h, p7/M, z20.s\n"
+      "uzp1 z24.h, z24.h, z24.h\n"
+      "ld1h { z23.h }, p7/Z, [x12]\n"
+      "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
+      "uzp1 z20.h, z20.h, z20.h\n"
+      "trn1 z24.d, z24.d, z20.d\n"
+      "ld1h { z21.h }, p7/Z, [x11]\n"
+      "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6477e708  // bfmmla z8.s, z24.h, z23.h\n"
+      ".inst 0x6476e70e  // bfmmla z14.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x10]\n"
+      "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6475e709  // bfmmla z9.s, z24.h, z21.h\n"
+      ".inst 0x6474e70f  // bfmmla z15.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x9]\n"
+      "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6477e70a  // bfmmla z10.s, z24.h, z23.h\n"
+      ".inst 0x6476e710  // bfmmla z16.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x28]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6475e70b  // bfmmla z11.s, z24.h, z21.h\n"
+      ".inst 0x6474e711  // bfmmla z17.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x27]\n"
+      "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      ".inst 0x6477e70c  // bfmmla z12.s, z24.h, z23.h\n"
+      ".inst 0x6476e712  // bfmmla z18.s, z24.h, z22.h\n"
+      ".inst 0x6475e70d  // bfmmla z13.s, z24.h, z21.h\n"
+      ".inst 0x6474e713  // bfmmla z19.s, z24.h, z20.h\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       "addvl x12, x12, #2\n"
@@ -520,39 +520,39 @@
       "bgt 24b\n"
       "25:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
+      "ld1rqw { z24.s }, p0/Z, [x24]\n"
+      "ld1rqw { z20.s }, p0/Z, [x23]\n"
+      ".inst 0x658abf18  // bfcvt z24.h, p7/M, z24.s\n"
+      ".inst 0x658abe94  // bfcvt z20.h, p7/M, z20.s\n"
+      "uzp1 z24.h, z24.h, z24.h\n"
+      "ld1h { z23.h }, p7/Z, [x12]\n"
+      "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n"
+      "uzp1 z20.h, z20.h, z20.h\n"
+      "trn1 z24.d, z24.d, z20.d\n"
+      "ld1h { z21.h }, p7/Z, [x11]\n"
+      "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6477e708  // bfmmla z8.s, z24.h, z23.h\n"
+      ".inst 0x6476e70e  // bfmmla z14.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x10]\n"
+      "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6475e709  // bfmmla z9.s, z24.h, z21.h\n"
+      ".inst 0x6474e70f  // bfmmla z15.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x9]\n"
+      "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6477e70a  // bfmmla z10.s, z24.h, z23.h\n"
+      ".inst 0x6476e710  // bfmmla z16.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x28]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6475e70b  // bfmmla z11.s, z24.h, z21.h\n"
+      ".inst 0x6474e711  // bfmmla z17.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x27]\n"
+      "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6477e70c  // bfmmla z12.s, z24.h, z23.h\n"
+      ".inst 0x6476e712  // bfmmla z18.s, z24.h, z22.h\n"
       "addvl x12, x12, #2\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      ".inst 0x6475e70d  // bfmmla z13.s, z24.h, z21.h\n"
+      ".inst 0x6474e713  // bfmmla z19.s, z24.h, z20.h\n"
       "addvl x10, x10, #2\n"
       "addvl x9, x9, #2\n"
       "addvl x28, x28, #2\n"
@@ -578,33 +578,33 @@
       "uzp2 z13.d, z13.d, z19.d\n"
       "tbz %x[flags], #1, 27f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p7/Z, [x20]\n"
+      "ld1rw { z20.s }, p7/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p7/Z, [x20]\n"
-      "fmin z4.s, p7/M, z4.s, z1.s\n"
-      "fmin z14.s, p7/M, z14.s, z1.s\n"
-      "fmin z15.s, p7/M, z15.s, z1.s\n"
-      "fmin z16.s, p7/M, z16.s, z1.s\n"
-      "fmin z17.s, p7/M, z17.s, z1.s\n"
-      "fmin z18.s, p7/M, z18.s, z1.s\n"
-      "fmin z8.s, p7/M, z8.s, z1.s\n"
-      "fmin z9.s, p7/M, z9.s, z1.s\n"
-      "fmin z10.s, p7/M, z10.s, z1.s\n"
-      "fmin z11.s, p7/M, z11.s, z1.s\n"
-      "fmin z12.s, p7/M, z12.s, z1.s\n"
-      "fmin z13.s, p7/M, z13.s, z1.s\n"
-      "fmax z4.s, p7/M, z4.s, z0.s\n"
-      "fmax z14.s, p7/M, z14.s, z0.s\n"
-      "fmax z15.s, p7/M, z15.s, z0.s\n"
-      "fmax z16.s, p7/M, z16.s, z0.s\n"
-      "fmax z17.s, p7/M, z17.s, z0.s\n"
-      "fmax z18.s, p7/M, z18.s, z0.s\n"
-      "fmax z8.s, p7/M, z8.s, z0.s\n"
-      "fmax z9.s, p7/M, z9.s, z0.s\n"
-      "fmax z10.s, p7/M, z10.s, z0.s\n"
-      "fmax z11.s, p7/M, z11.s, z0.s\n"
-      "fmax z12.s, p7/M, z12.s, z0.s\n"
-      "fmax z13.s, p7/M, z13.s, z0.s\n"
+      "ld1rw { z19.s }, p7/Z, [x20]\n"
+      "fmin z4.s, p7/M, z4.s, z20.s\n"
+      "fmin z14.s, p7/M, z14.s, z20.s\n"
+      "fmin z15.s, p7/M, z15.s, z20.s\n"
+      "fmin z16.s, p7/M, z16.s, z20.s\n"
+      "fmin z17.s, p7/M, z17.s, z20.s\n"
+      "fmin z18.s, p7/M, z18.s, z20.s\n"
+      "fmin z8.s, p7/M, z8.s, z20.s\n"
+      "fmin z9.s, p7/M, z9.s, z20.s\n"
+      "fmin z10.s, p7/M, z10.s, z20.s\n"
+      "fmin z11.s, p7/M, z11.s, z20.s\n"
+      "fmin z12.s, p7/M, z12.s, z20.s\n"
+      "fmin z13.s, p7/M, z13.s, z20.s\n"
+      "fmax z4.s, p7/M, z4.s, z19.s\n"
+      "fmax z14.s, p7/M, z14.s, z19.s\n"
+      "fmax z15.s, p7/M, z15.s, z19.s\n"
+      "fmax z16.s, p7/M, z16.s, z19.s\n"
+      "fmax z17.s, p7/M, z17.s, z19.s\n"
+      "fmax z18.s, p7/M, z18.s, z19.s\n"
+      "fmax z8.s, p7/M, z8.s, z19.s\n"
+      "fmax z9.s, p7/M, z9.s, z19.s\n"
+      "fmax z10.s, p7/M, z10.s, z19.s\n"
+      "fmax z11.s, p7/M, z11.s, z19.s\n"
+      "fmax z12.s, p7/M, z12.s, z19.s\n"
+      "fmax z13.s, p7/M, z13.s, z19.s\n"
       "27:"  // Height 2: No activation
       "st1w { z4.s }, p6, [x13]\n"
       "st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
@@ -709,38 +709,38 @@
       "32:"  // Height 3: no bias
       "tbz %x[flags], #0, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x23, x13, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z9.s }, p6/Z, [x13]\n"
-      "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+      "add x21, x13, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z16.s }, p6/Z, [x13]\n"
+      "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
-      "ld1w { z14.s }, p6/Z, [x23]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
-      "ld1w { z21.s }, p6/Z, [x22]\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
-      "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z14.s }, p6/Z, [x21]\n"
+      "zip1 z8.d, z16.d, z14.d\n"
+      "zip2 z14.d, z16.d, z14.d\n"
+      "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z15.d\n"
+      "zip2 z15.d, z17.d, z15.d\n"
+      "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+      "zip1 z10.d, z19.d, z16.d\n"
+      "zip2 z16.d, z19.d, z16.d\n"
+      "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
+      "ld1w { z21.s }, p6/Z, [x20]\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
+      "zip1 z12.d, z24.d, z18.d\n"
+      "zip2 z18.d, z24.d, z18.d\n"
+      "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
-      "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
       "zip1 z20.d, z21.d, z26.d\n"
       "zip2 z26.d, z21.d, z26.d\n"
       "zip1 z21.d, z22.d, z27.d\n"
@@ -751,8 +751,8 @@
       "zip2 z29.d, z24.d, z29.d\n"
       "zip1 z24.d, z25.d, z30.d\n"
       "zip2 z30.d, z25.d, z30.d\n"
-      "zip1 z25.d, z4.d, z31.d\n"
-      "zip2 z31.d, z4.d, z31.d\n"
+      "zip1 z25.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 34f\n"
       "33:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -784,13 +784,13 @@
       "35:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 36f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 37f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -799,125 +799,125 @@
       "b 37f\n"
       "36:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "37:"  // Height 3: input setup done
       "cmp x25, #0x4\n"
       "ble 39f\n"
       "38:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
       ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z3.h }, p7/Z, [x12]\n"
       "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+      "ld1h { z1.h }, p7/Z, [x11]\n"
+      "trn1 z5.d, z5.d, z0.d\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6463e4a8  // bfmmla z8.s, z5.h, z3.h\n"
+      ".inst 0x6463e494  // bfmmla z20.s, z4.h, z3.h\n"
+      ".inst 0x6462e4ae  // bfmmla z14.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x10]\n"
       "sub x25, x25, #0x4\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      ".inst 0x6461e4a9  // bfmmla z9.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
+      ".inst 0x6461e495  // bfmmla z21.s, z4.h, z1.h\n"
+      ".inst 0x6460e4af  // bfmmla z15.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x9]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
+      ".inst 0x6460e49b  // bfmmla z27.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6463e4aa  // bfmmla z10.s, z5.h, z3.h\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
+      ".inst 0x6463e496  // bfmmla z22.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b0  // bfmmla z16.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6462e49c  // bfmmla z28.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ab  // bfmmla z11.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
+      ".inst 0x6461e497  // bfmmla z23.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b1  // bfmmla z17.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x27]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6463e4ac  // bfmmla z12.s, z5.h, z3.h\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
+      ".inst 0x6463e498  // bfmmla z24.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b2  // bfmmla z18.s, z5.h, z2.h\n"
       "addvl x9, x9, #2\n"
       "addvl x28, x28, #2\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
+      ".inst 0x6462e49e  // bfmmla z30.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ad  // bfmmla z13.s, z5.h, z1.h\n"
       "addvl x27, x27, #2\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6461e499  // bfmmla z25.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b3  // bfmmla z19.s, z5.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "bgt 38b\n"
       "39:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
       ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z3.h }, p7/Z, [x12]\n"
       "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+      "ld1h { z1.h }, p7/Z, [x11]\n"
+      "trn1 z5.d, z5.d, z0.d\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+      ".inst 0x6463e4a8  // bfmmla z8.s, z5.h, z3.h\n"
+      ".inst 0x6463e494  // bfmmla z20.s, z4.h, z3.h\n"
+      ".inst 0x6462e4ae  // bfmmla z14.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x10]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      ".inst 0x6461e4a9  // bfmmla z9.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
+      ".inst 0x6461e495  // bfmmla z21.s, z4.h, z1.h\n"
+      ".inst 0x6460e4af  // bfmmla z15.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x9]\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
+      ".inst 0x6460e49b  // bfmmla z27.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6463e4aa  // bfmmla z10.s, z5.h, z3.h\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6463e496  // bfmmla z22.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b0  // bfmmla z16.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
+      ".inst 0x6462e49c  // bfmmla z28.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ab  // bfmmla z11.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
       "addvl x28, x28, #2\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6461e497  // bfmmla z23.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b1  // bfmmla z17.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x27]\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6463e4ac  // bfmmla z12.s, z5.h, z3.h\n"
       "addvl x27, x27, #2\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6463e498  // bfmmla z24.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b2  // bfmmla z18.s, z5.h, z2.h\n"
+      ".inst 0x6462e49e  // bfmmla z30.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ad  // bfmmla z13.s, z5.h, z1.h\n"
+      ".inst 0x6461e499  // bfmmla z25.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b3  // bfmmla z19.s, z5.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "40:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -946,45 +946,45 @@
       "uzp1 z25.d, z25.d, z31.d\n"
       "tbz %x[flags], #1, 41f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p7/Z, [x20]\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
       "ld1rw { z0.s }, p7/Z, [x20]\n"
-      "fmin z4.s, p7/M, z4.s, z1.s\n"
-      "fmin z14.s, p7/M, z14.s, z1.s\n"
-      "fmin z15.s, p7/M, z15.s, z1.s\n"
-      "fmin z16.s, p7/M, z16.s, z1.s\n"
-      "fmin z17.s, p7/M, z17.s, z1.s\n"
-      "fmin z18.s, p7/M, z18.s, z1.s\n"
-      "fmin z8.s, p7/M, z8.s, z1.s\n"
-      "fmin z9.s, p7/M, z9.s, z1.s\n"
-      "fmin z10.s, p7/M, z10.s, z1.s\n"
-      "fmin z11.s, p7/M, z11.s, z1.s\n"
-      "fmin z12.s, p7/M, z12.s, z1.s\n"
-      "fmin z13.s, p7/M, z13.s, z1.s\n"
-      "fmin z20.s, p7/M, z20.s, z1.s\n"
-      "fmin z21.s, p7/M, z21.s, z1.s\n"
-      "fmin z22.s, p7/M, z22.s, z1.s\n"
-      "fmin z23.s, p7/M, z23.s, z1.s\n"
-      "fmin z24.s, p7/M, z24.s, z1.s\n"
-      "fmin z25.s, p7/M, z25.s, z1.s\n"
-      "fmax z4.s, p7/M, z4.s, z0.s\n"
-      "fmax z14.s, p7/M, z14.s, z0.s\n"
-      "fmax z15.s, p7/M, z15.s, z0.s\n"
-      "fmax z16.s, p7/M, z16.s, z0.s\n"
-      "fmax z17.s, p7/M, z17.s, z0.s\n"
-      "fmax z18.s, p7/M, z18.s, z0.s\n"
-      "fmax z8.s, p7/M, z8.s, z0.s\n"
-      "fmax z9.s, p7/M, z9.s, z0.s\n"
-      "fmax z10.s, p7/M, z10.s, z0.s\n"
-      "fmax z11.s, p7/M, z11.s, z0.s\n"
-      "fmax z12.s, p7/M, z12.s, z0.s\n"
-      "fmax z13.s, p7/M, z13.s, z0.s\n"
-      "fmax z20.s, p7/M, z20.s, z0.s\n"
-      "fmax z21.s, p7/M, z21.s, z0.s\n"
-      "fmax z22.s, p7/M, z22.s, z0.s\n"
-      "fmax z23.s, p7/M, z23.s, z0.s\n"
-      "fmax z24.s, p7/M, z24.s, z0.s\n"
-      "fmax z25.s, p7/M, z25.s, z0.s\n"
+      "add x20, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z19.s }, p7/Z, [x20]\n"
+      "fmin z4.s, p7/M, z4.s, z0.s\n"
+      "fmin z14.s, p7/M, z14.s, z0.s\n"
+      "fmin z15.s, p7/M, z15.s, z0.s\n"
+      "fmin z16.s, p7/M, z16.s, z0.s\n"
+      "fmin z17.s, p7/M, z17.s, z0.s\n"
+      "fmin z18.s, p7/M, z18.s, z0.s\n"
+      "fmin z8.s, p7/M, z8.s, z0.s\n"
+      "fmin z9.s, p7/M, z9.s, z0.s\n"
+      "fmin z10.s, p7/M, z10.s, z0.s\n"
+      "fmin z11.s, p7/M, z11.s, z0.s\n"
+      "fmin z12.s, p7/M, z12.s, z0.s\n"
+      "fmin z13.s, p7/M, z13.s, z0.s\n"
+      "fmin z20.s, p7/M, z20.s, z0.s\n"
+      "fmin z21.s, p7/M, z21.s, z0.s\n"
+      "fmin z22.s, p7/M, z22.s, z0.s\n"
+      "fmin z23.s, p7/M, z23.s, z0.s\n"
+      "fmin z24.s, p7/M, z24.s, z0.s\n"
+      "fmin z25.s, p7/M, z25.s, z0.s\n"
+      "fmax z4.s, p7/M, z4.s, z19.s\n"
+      "fmax z14.s, p7/M, z14.s, z19.s\n"
+      "fmax z15.s, p7/M, z15.s, z19.s\n"
+      "fmax z16.s, p7/M, z16.s, z19.s\n"
+      "fmax z17.s, p7/M, z17.s, z19.s\n"
+      "fmax z18.s, p7/M, z18.s, z19.s\n"
+      "fmax z8.s, p7/M, z8.s, z19.s\n"
+      "fmax z9.s, p7/M, z9.s, z19.s\n"
+      "fmax z10.s, p7/M, z10.s, z19.s\n"
+      "fmax z11.s, p7/M, z11.s, z19.s\n"
+      "fmax z12.s, p7/M, z12.s, z19.s\n"
+      "fmax z13.s, p7/M, z13.s, z19.s\n"
+      "fmax z20.s, p7/M, z20.s, z19.s\n"
+      "fmax z21.s, p7/M, z21.s, z19.s\n"
+      "fmax z22.s, p7/M, z22.s, z19.s\n"
+      "fmax z23.s, p7/M, z23.s, z19.s\n"
+      "fmax z24.s, p7/M, z24.s, z19.s\n"
+      "fmax z25.s, p7/M, z25.s, z19.s\n"
       "41:"  // Height 3: No activation
       "st1w { z4.s }, p6, [x13]\n"
       "st1w { z14.s }, p5, [x13, #1, MUL VL]\n"
@@ -1098,57 +1098,57 @@
       "46:"  // Height 4: no bias
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x23, x13, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z9.s }, p6/Z, [x13]\n"
+      "add x22, x13, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
-      "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n"
-      "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n"
-      "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n"
+      "ld1w { z16.s }, p6/Z, [x13]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n"
-      "ld1w { z14.s }, p6/Z, [x23]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
-      "ld1w { z21.s }, p6/Z, [x22]\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
-      "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z14.s }, p6/Z, [x22]\n"
+      "zip1 z8.d, z16.d, z14.d\n"
+      "zip2 z14.d, z16.d, z14.d\n"
+      "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z15.d\n"
+      "zip2 z15.d, z17.d, z15.d\n"
+      "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "zip1 z10.d, z19.d, z16.d\n"
+      "zip2 z16.d, z19.d, z16.d\n"
+      "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z21.s }, p6/Z, [x21]\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+      "zip1 z12.d, z24.d, z18.d\n"
+      "zip2 z18.d, z24.d, z18.d\n"
+      "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
-      "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
-      "ld1w { z26.s }, p6/Z, [x21]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+      "ld1w { z26.s }, p6/Z, [x20]\n"
       "zip1 z20.d, z21.d, z26.d\n"
       "zip2 z26.d, z21.d, z26.d\n"
-      "ld1w { z27.s }, p5/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
       "zip1 z21.d, z22.d, z27.d\n"
       "zip2 z27.d, z22.d, z27.d\n"
-      "ld1w { z29.s }, p3/Z, [x21, #3, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #4, MUL VL]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
       "zip1 z22.d, z23.d, z28.d\n"
       "zip2 z28.d, z23.d, z28.d\n"
-      "ld1w { z31.s }, p1/Z, [x21, #5, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
       "zip1 z23.d, z24.d, z29.d\n"
       "zip2 z29.d, z24.d, z29.d\n"
       "zip1 z24.d, z25.d, z30.d\n"
       "zip2 z30.d, z25.d, z30.d\n"
-      "zip1 z25.d, z4.d, z31.d\n"
-      "zip2 z31.d, z4.d, z31.d\n"
+      "zip1 z25.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 48f\n"
       "47:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -1180,14 +1180,14 @@
       "49:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -1197,135 +1197,135 @@
       "b 51f\n"
       "50:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "51:"  // Height 4: input setup done
       "cmp x25, #0x4\n"
       "ble 53f\n"
       "52:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      "ld1rqw { z3.s }, p0/Z, [x21]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      ".inst 0x658abc63  // bfcvt z3.h, p7/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
+      "ld1rqw { z7.s }, p0/Z, [x24]\n"
+      "ld1rqw { z6.s }, p0/Z, [x23]\n"
+      ".inst 0x658abce7  // bfcvt z7.h, p7/M, z7.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x22]\n"
+      "ld1rqw { z4.s }, p0/Z, [x21]\n"
+      ".inst 0x658abcc6  // bfcvt z6.h, p7/M, z6.s\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "uzp1 z7.h, z7.h, z7.h\n"
+      "ld1h { z3.h }, p7/Z, [x12]\n"
+      "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z1.h }, p7/Z, [x11]\n"
+      "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "trn1 z7.d, z7.d, z6.d\n"
+      ".inst 0x6463e4e8  // bfmmla z8.s, z7.h, z3.h\n"
       "sub x25, x25, #0x4\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+      "trn1 z5.d, z5.d, z4.d\n"
+      ".inst 0x6463e4b4  // bfmmla z20.s, z5.h, z3.h\n"
+      ".inst 0x6462e4ee  // bfmmla z14.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x10]\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
+      ".inst 0x6461e4b5  // bfmmla z21.s, z5.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x9]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
+      ".inst 0x6460e4bb  // bfmmla z27.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6463e4ea  // bfmmla z10.s, z7.h, z3.h\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
+      ".inst 0x6463e4b6  // bfmmla z22.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f0  // bfmmla z16.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6462e4bc  // bfmmla z28.s, z5.h, z2.h\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
+      ".inst 0x6461e4b7  // bfmmla z23.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f1  // bfmmla z17.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x27]\n"
       "addvl x12, x12, #2\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6463e4ec  // bfmmla z12.s, z7.h, z3.h\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
+      ".inst 0x6463e4b8  // bfmmla z24.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f2  // bfmmla z18.s, z7.h, z2.h\n"
       "addvl x10, x10, #2\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
+      ".inst 0x6462e4be  // bfmmla z30.s, z5.h, z2.h\n"
+      ".inst 0x6461e4ed  // bfmmla z13.s, z7.h, z1.h\n"
       "addvl x28, x28, #2\n"
       "addvl x27, x27, #2\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6461e4b9  // bfmmla z25.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f3  // bfmmla z19.s, z7.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "bgt 52b\n"
       "53:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      "ld1rqw { z3.s }, p0/Z, [x21]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      ".inst 0x658abc63  // bfcvt z3.h, p7/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x12]\n"
-      "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z6.h }, p7/Z, [x11]\n"
-      "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
+      "ld1rqw { z7.s }, p0/Z, [x24]\n"
+      "ld1rqw { z6.s }, p0/Z, [x23]\n"
+      ".inst 0x658abce7  // bfcvt z7.h, p7/M, z7.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x22]\n"
+      "ld1rqw { z4.s }, p0/Z, [x21]\n"
+      ".inst 0x658abcc6  // bfcvt z6.h, p7/M, z6.s\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "uzp1 z7.h, z7.h, z7.h\n"
+      "ld1h { z3.h }, p7/Z, [x12]\n"
+      "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z1.h }, p7/Z, [x11]\n"
+      "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "trn1 z7.d, z7.d, z6.d\n"
+      ".inst 0x6463e4e8  // bfmmla z8.s, z7.h, z3.h\n"
       "addvl x12, x12, #2\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x10]\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n"
+      "trn1 z5.d, z5.d, z4.d\n"
+      ".inst 0x6463e4b4  // bfmmla z20.s, z5.h, z3.h\n"
+      ".inst 0x6462e4ee  // bfmmla z14.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x10]\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n"
       "addvl x11, x11, #2\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x9]\n"
+      ".inst 0x6461e4b5  // bfmmla z21.s, z5.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x9]\n"
       "addvl x10, x10, #2\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
+      ".inst 0x6460e4bb  // bfmmla z27.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x6463e4ea  // bfmmla z10.s, z7.h, z3.h\n"
       "addvl x9, x9, #2\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6463e4b6  // bfmmla z22.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f0  // bfmmla z16.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
+      ".inst 0x6462e4bc  // bfmmla z28.s, z5.h, z2.h\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
       "addvl x28, x28, #2\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x27]\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6461e4b7  // bfmmla z23.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f1  // bfmmla z17.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x27]\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n"
+      ".inst 0x6463e4ec  // bfmmla z12.s, z7.h, z3.h\n"
       "addvl x27, x27, #2\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6463e4b8  // bfmmla z24.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f2  // bfmmla z18.s, z7.h, z2.h\n"
+      ".inst 0x6462e4be  // bfmmla z30.s, z5.h, z2.h\n"
+      ".inst 0x6461e4ed  // bfmmla z13.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b9  // bfmmla z25.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f3  // bfmmla z19.s, z7.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "54:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -1461,4 +1461,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
index 5792a71..d0ef531 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 7649336..576bd47 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -53,33 +53,33 @@
     __asm__ __volatile__(
       "ptrue p0.b\n"
       "1:"  // Height loop
-      "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x24, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x25, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
       "cntw x23, ALL, MUL #2\n"
-      "add x22, x26, x20, LSL #1\n"
+      "add x22, x24, x20, LSL #1\n"
       "add x21, x22, x20, LSL #1\n"
       "add x20, x21, x20, LSL #1\n"
-      "cmp x25, x23\n"
+      "cmp x26, x23\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov %x[Apanel], x24\n"
+      "mov %x[Apanel], x25\n"
       "bgt 3f\n"
       "decw x23\n"
-      "cmp x25, x23\n"
-      "mov x21, x26\n"
+      "cmp x26, x23\n"
+      "mov x21, x24\n"
       "bgt 3f\n"
-      "mov x22, x26\n"
+      "mov x22, x24\n"
       "3:"  // B setup done
       "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
       "cmp x20, #0x2\n"
       "mov z8.b, #0x0\n"
       "mov z9.b, #0x0\n"
       "mov z10.b, #0x0\n"
-      "ld1h { z4.h }, p0/Z, [x26]\n"
+      "ld1h { z4.h }, p0/Z, [x24]\n"
       "mov z11.b, #0x0\n"
       "mov z12.b, #0x0\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
@@ -88,13 +88,13 @@
       "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
       "mov z15.b, #0x0\n"
       "mov z16.b, #0x0\n"
-      "ld1h { z5.h }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1h { z5.h }, p0/Z, [x24, #1, MUL VL]\n"
       "mov z17.b, #0x0\n"
       "mov z18.b, #0x0\n"
       "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
       "mov z19.b, #0x0\n"
       "mov z20.b, #0x0\n"
-      "addvl x26, x26, #2\n"
+      "addvl x24, x24, #2\n"
       "mov z21.b, #0x0\n"
       "mov z22.b, #0x0\n"
       "add %x[Apanel], %x[Apanel], #0x30\n"
@@ -109,83 +109,83 @@
       "mov z31.b, #0x0\n"
       "blt 5f\n"
       "4:"  // main loop head
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
       ".inst 0x6465e40b  // bfmmla z11.s, z0.h, z5.h\n"
       ".inst 0x6464e42e  // bfmmla z14.s, z1.h, z4.h\n"
       ".inst 0x6465e431  // bfmmla z17.s, z1.h, z5.h\n"
-      "ld1h { z6.h }, p0/Z, [x22]\n"
+      "ld1h { z7.h }, p0/Z, [x22]\n"
       ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
       ".inst 0x6465e457  // bfmmla z23.s, z2.h, z5.h\n"
-      "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x6464e47a  // bfmmla z26.s, z3.h, z4.h\n"
-      ".inst 0x6465e47d  // bfmmla z29.s, z3.h, z5.h\n"
-      "ld1h { z4.h }, p0/Z, [x21]\n"
-      "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6467e432  // bfmmla z18.s, z1.h, z7.h\n"
+      "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x6464e4da  // bfmmla z26.s, z6.h, z4.h\n"
+      ".inst 0x6465e4dd  // bfmmla z29.s, z6.h, z5.h\n"
+      "ld1h { z5.h }, p0/Z, [x21]\n"
+      "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
+      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6463e40c  // bfmmla z12.s, z0.h, z3.h\n"
+      ".inst 0x6467e42f  // bfmmla z15.s, z1.h, z7.h\n"
+      ".inst 0x6463e432  // bfmmla z18.s, z1.h, z3.h\n"
       "sub x20, x20, #0x2\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e458  // bfmmla z24.s, z2.h, z7.h\n"
+      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
+      ".inst 0x6463e458  // bfmmla z24.s, z2.h, z3.h\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6466e47b  // bfmmla z27.s, z3.h, z6.h\n"
-      ".inst 0x6467e47e  // bfmmla z30.s, z3.h, z7.h\n"
-      "ld1h { z6.h }, p0/Z, [x26]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e40d  // bfmmla z13.s, z0.h, z5.h\n"
+      ".inst 0x6467e4db  // bfmmla z27.s, z6.h, z7.h\n"
+      ".inst 0x6463e4de  // bfmmla z30.s, z6.h, z3.h\n"
+      "ld1h { z3.h }, p0/Z, [x24]\n"
+      ".inst 0x6465e40a  // bfmmla z10.s, z0.h, z5.h\n"
+      ".inst 0x6464e40d  // bfmmla z13.s, z0.h, z4.h\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
-      ".inst 0x6464e430  // bfmmla z16.s, z1.h, z4.h\n"
-      ".inst 0x6465e433  // bfmmla z19.s, z1.h, z5.h\n"
+      ".inst 0x6465e430  // bfmmla z16.s, z1.h, z5.h\n"
+      ".inst 0x6464e433  // bfmmla z19.s, z1.h, z4.h\n"
       "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e459  // bfmmla z25.s, z2.h, z5.h\n"
-      "ld1h { z7.h }, p0/Z, [x26, #1, MUL VL]\n"
-      ".inst 0x6464e47c  // bfmmla z28.s, z3.h, z4.h\n"
-      ".inst 0x6465e47f  // bfmmla z31.s, z3.h, z5.h\n"
-      "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
+      ".inst 0x6465e456  // bfmmla z22.s, z2.h, z5.h\n"
+      ".inst 0x6464e459  // bfmmla z25.s, z2.h, z4.h\n"
+      "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
+      ".inst 0x6465e4dc  // bfmmla z28.s, z6.h, z5.h\n"
+      ".inst 0x6464e4df  // bfmmla z31.s, z6.h, z4.h\n"
+      "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
+      ".inst 0x6463e408  // bfmmla z8.s, z0.h, z3.h\n"
       ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
+      ".inst 0x6463e42e  // bfmmla z14.s, z1.h, z3.h\n"
       ".inst 0x6467e431  // bfmmla z17.s, z1.h, z7.h\n"
-      "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x6466e47a  // bfmmla z26.s, z3.h, z6.h\n"
-      ".inst 0x6467e47d  // bfmmla z29.s, z3.h, z7.h\n"
-      "ld1h { z6.h }, p0/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+      ".inst 0x6463e4b4  // bfmmla z20.s, z5.h, z3.h\n"
+      ".inst 0x6467e4b7  // bfmmla z23.s, z5.h, z7.h\n"
+      "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x6463e4da  // bfmmla z26.s, z6.h, z3.h\n"
+      ".inst 0x6467e4dd  // bfmmla z29.s, z6.h, z7.h\n"
+      "ld1h { z3.h }, p0/Z, [x21, #2, MUL VL]\n"
       "ld1h { z7.h }, p0/Z, [x21, #3, MUL VL]\n"
-      ".inst 0x6464e409  // bfmmla z9.s, z0.h, z4.h\n"
-      ".inst 0x6465e40c  // bfmmla z12.s, z0.h, z5.h\n"
-      ".inst 0x6464e42f  // bfmmla z15.s, z1.h, z4.h\n"
-      ".inst 0x6465e432  // bfmmla z18.s, z1.h, z5.h\n"
+      ".inst 0x6462e409  // bfmmla z9.s, z0.h, z2.h\n"
+      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6462e42f  // bfmmla z15.s, z1.h, z2.h\n"
+      ".inst 0x6464e432  // bfmmla z18.s, z1.h, z4.h\n"
       "addvl x22, x22, #4\n"
-      ".inst 0x6464e455  // bfmmla z21.s, z2.h, z4.h\n"
-      ".inst 0x6465e458  // bfmmla z24.s, z2.h, z5.h\n"
+      ".inst 0x6462e4b5  // bfmmla z21.s, z5.h, z2.h\n"
+      ".inst 0x6464e4b8  // bfmmla z24.s, z5.h, z4.h\n"
       "addvl x21, x21, #4\n"
-      ".inst 0x6464e47b  // bfmmla z27.s, z3.h, z4.h\n"
-      ".inst 0x6465e47e  // bfmmla z30.s, z3.h, z5.h\n"
-      "ld1h { z4.h }, p0/Z, [x26, #2, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
+      ".inst 0x6462e4db  // bfmmla z27.s, z6.h, z2.h\n"
+      ".inst 0x6464e4de  // bfmmla z30.s, z6.h, z4.h\n"
+      "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n"
+      ".inst 0x6463e40a  // bfmmla z10.s, z0.h, z3.h\n"
       ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
-      ".inst 0x6466e430  // bfmmla z16.s, z1.h, z6.h\n"
+      ".inst 0x6463e430  // bfmmla z16.s, z1.h, z3.h\n"
       ".inst 0x6467e433  // bfmmla z19.s, z1.h, z7.h\n"
       "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6467e459  // bfmmla z25.s, z2.h, z7.h\n"
-      "ld1h { z5.h }, p0/Z, [x26, #3, MUL VL]\n"
-      ".inst 0x6466e47c  // bfmmla z28.s, z3.h, z6.h\n"
-      ".inst 0x6467e47f  // bfmmla z31.s, z3.h, z7.h\n"
+      ".inst 0x6463e4b6  // bfmmla z22.s, z5.h, z3.h\n"
+      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
+      "ld1h { z5.h }, p0/Z, [x24, #3, MUL VL]\n"
+      ".inst 0x6463e4dc  // bfmmla z28.s, z6.h, z3.h\n"
+      ".inst 0x6467e4df  // bfmmla z31.s, z6.h, z7.h\n"
       "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
-      "addvl x26, x26, #4\n"
+      "addvl x24, x24, #4\n"
       "bge 4b\n"
       "5:"  // main loop skip
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
       ".inst 0x6465e40b  // bfmmla z11.s, z0.h, z5.h\n"
       ".inst 0x6464e42e  // bfmmla z14.s, z1.h, z4.h\n"
@@ -193,115 +193,115 @@
       "ld1h { z6.h }, p0/Z, [x22]\n"
       ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
       ".inst 0x6465e457  // bfmmla z23.s, z2.h, z5.h\n"
-      "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x6464e47a  // bfmmla z26.s, z3.h, z4.h\n"
-      ".inst 0x6465e47d  // bfmmla z29.s, z3.h, z5.h\n"
-      "ld1h { z4.h }, p0/Z, [x21]\n"
-      "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x6464e4fa  // bfmmla z26.s, z7.h, z4.h\n"
+      ".inst 0x6465e4fd  // bfmmla z29.s, z7.h, z5.h\n"
+      "ld1h { z5.h }, p0/Z, [x21]\n"
+      "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n"
       ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
+      ".inst 0x6463e40c  // bfmmla z12.s, z0.h, z3.h\n"
       ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6467e432  // bfmmla z18.s, z1.h, z7.h\n"
+      ".inst 0x6463e432  // bfmmla z18.s, z1.h, z3.h\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
       ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e458  // bfmmla z24.s, z2.h, z7.h\n"
+      ".inst 0x6463e458  // bfmmla z24.s, z2.h, z3.h\n"
       "addvl x22, x22, #2\n"
-      ".inst 0x6466e47b  // bfmmla z27.s, z3.h, z6.h\n"
-      ".inst 0x6467e47e  // bfmmla z30.s, z3.h, z7.h\n"
+      ".inst 0x6466e4fb  // bfmmla z27.s, z7.h, z6.h\n"
+      ".inst 0x6463e4fe  // bfmmla z30.s, z7.h, z3.h\n"
       "addvl x21, x21, #2\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e40d  // bfmmla z13.s, z0.h, z5.h\n"
-      ".inst 0x6464e430  // bfmmla z16.s, z1.h, z4.h\n"
-      ".inst 0x6465e433  // bfmmla z19.s, z1.h, z5.h\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e459  // bfmmla z25.s, z2.h, z5.h\n"
-      ".inst 0x6464e47c  // bfmmla z28.s, z3.h, z4.h\n"
-      ".inst 0x6465e47f  // bfmmla z31.s, z3.h, z5.h\n"
+      ".inst 0x6465e40a  // bfmmla z10.s, z0.h, z5.h\n"
+      ".inst 0x6464e40d  // bfmmla z13.s, z0.h, z4.h\n"
+      ".inst 0x6465e430  // bfmmla z16.s, z1.h, z5.h\n"
+      ".inst 0x6464e433  // bfmmla z19.s, z1.h, z4.h\n"
+      ".inst 0x6465e456  // bfmmla z22.s, z2.h, z5.h\n"
+      ".inst 0x6464e459  // bfmmla z25.s, z2.h, z4.h\n"
+      ".inst 0x6465e4fc  // bfmmla z28.s, z7.h, z5.h\n"
+      ".inst 0x6464e4ff  // bfmmla z31.s, z7.h, z4.h\n"
       "cbz x20, 6f\n"
-      "ld1h { z6.h }, p0/Z, [x26]\n"
-      "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
-      "ld1h { z7.h }, p0/Z, [x26, #1, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6467e431  // bfmmla z17.s, z1.h, z7.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z4.h }, p0/Z, [x22]\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6466e47a  // bfmmla z26.s, z3.h, z6.h\n"
-      "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x6467e47d  // bfmmla z29.s, z3.h, z7.h\n"
-      "ld1h { z6.h }, p0/Z, [x21]\n"
-      "ld1h { z7.h }, p0/Z, [x21, #1, MUL VL]\n"
-      ".inst 0x6464e409  // bfmmla z9.s, z0.h, z4.h\n"
-      ".inst 0x6465e40c  // bfmmla z12.s, z0.h, z5.h\n"
+      "ld1h { z1.h }, p0/Z, [x24]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+      ".inst 0x6461e4e8  // bfmmla z8.s, z7.h, z1.h\n"
+      "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1h { z0.h }, p0/Z, [x24, #1, MUL VL]\n"
+      ".inst 0x6460e4eb  // bfmmla z11.s, z7.h, z0.h\n"
+      "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
+      ".inst 0x6461e4ce  // bfmmla z14.s, z6.h, z1.h\n"
+      ".inst 0x6460e4d1  // bfmmla z17.s, z6.h, z0.h\n"
+      ".inst 0x6461e4b4  // bfmmla z20.s, z5.h, z1.h\n"
+      "ld1h { z3.h }, p0/Z, [x22]\n"
+      ".inst 0x6460e4b7  // bfmmla z23.s, z5.h, z0.h\n"
+      ".inst 0x6461e49a  // bfmmla z26.s, z4.h, z1.h\n"
+      "ld1h { z2.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z1.h }, p0/Z, [x21]\n"
+      "ld1h { z0.h }, p0/Z, [x21, #1, MUL VL]\n"
+      ".inst 0x6463e4e9  // bfmmla z9.s, z7.h, z3.h\n"
+      ".inst 0x6462e4ec  // bfmmla z12.s, z7.h, z2.h\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x6464e42f  // bfmmla z15.s, z1.h, z4.h\n"
-      ".inst 0x6465e432  // bfmmla z18.s, z1.h, z5.h\n"
-      ".inst 0x6464e455  // bfmmla z21.s, z2.h, z4.h\n"
-      ".inst 0x6465e458  // bfmmla z24.s, z2.h, z5.h\n"
-      ".inst 0x6464e47b  // bfmmla z27.s, z3.h, z4.h\n"
-      ".inst 0x6465e47e  // bfmmla z30.s, z3.h, z5.h\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      ".inst 0x6466e430  // bfmmla z16.s, z1.h, z6.h\n"
-      ".inst 0x6467e433  // bfmmla z19.s, z1.h, z7.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6467e459  // bfmmla z25.s, z2.h, z7.h\n"
-      ".inst 0x6466e47c  // bfmmla z28.s, z3.h, z6.h\n"
-      ".inst 0x6467e47f  // bfmmla z31.s, z3.h, z7.h\n"
+      ".inst 0x6463e4cf  // bfmmla z15.s, z6.h, z3.h\n"
+      ".inst 0x6462e4d2  // bfmmla z18.s, z6.h, z2.h\n"
+      ".inst 0x6463e4b5  // bfmmla z21.s, z5.h, z3.h\n"
+      ".inst 0x6462e4b8  // bfmmla z24.s, z5.h, z2.h\n"
+      ".inst 0x6463e49b  // bfmmla z27.s, z4.h, z3.h\n"
+      ".inst 0x6462e49e  // bfmmla z30.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ea  // bfmmla z10.s, z7.h, z1.h\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6461e4d0  // bfmmla z16.s, z6.h, z1.h\n"
+      ".inst 0x6460e4d3  // bfmmla z19.s, z6.h, z0.h\n"
+      ".inst 0x6461e4b6  // bfmmla z22.s, z5.h, z1.h\n"
+      ".inst 0x6460e4b9  // bfmmla z25.s, z5.h, z0.h\n"
+      ".inst 0x6461e49c  // bfmmla z28.s, z4.h, z1.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "6:"  // multiply loop done
-      "decw x25, ALL, MUL #3\n"
-      "uzp1 z4.d, z8.d, z11.d\n"
+      "decw x26, ALL, MUL #3\n"
+      "uzp1 z0.d, z8.d, z11.d\n"
       "uzp2 z8.d, z8.d, z11.d\n"
-      "uzp1 z11.d, z9.d, z12.d\n"
+      "uzp1 z1.d, z9.d, z12.d\n"
       "uzp2 z9.d, z9.d, z12.d\n"
-      "st1w { z4.s }, p0, [%x[Cpanel]]\n"
-      "uzp1 z12.d, z10.d, z13.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+      "uzp1 z0.d, z10.d, z13.d\n"
       "uzp2 z10.d, z10.d, z13.d\n"
-      "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
-      "uzp1 z13.d, z14.d, z17.d\n"
+      "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "uzp1 z2.d, z14.d, z17.d\n"
       "uzp2 z14.d, z14.d, z17.d\n"
       "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "uzp1 z17.d, z15.d, z18.d\n"
-      "cmp x25, XZR\n"
+      "uzp1 z1.d, z15.d, z18.d\n"
+      "cmp x26, XZR\n"
       "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
       "uzp2 z15.d, z15.d, z18.d\n"
-      "uzp1 z18.d, z16.d, z19.d\n"
+      "uzp1 z17.d, z16.d, z19.d\n"
       "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "uzp2 z16.d, z16.d, z19.d\n"
-      "uzp1 z19.d, z20.d, z23.d\n"
-      "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+      "uzp1 z0.d, z20.d, z23.d\n"
+      "st1w { z2.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "uzp2 z20.d, z20.d, z23.d\n"
       "uzp1 z23.d, z21.d, z24.d\n"
-      "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+      "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
       "addvl %x[Cpanel], %x[Cpanel], #16\n"
       "uzp2 z21.d, z21.d, z24.d\n"
-      "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
-      "uzp1 z24.d, z22.d, z25.d\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+      "uzp1 z19.d, z22.d, z25.d\n"
       "uzp2 z22.d, z22.d, z25.d\n"
       "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
-      "uzp1 z25.d, z26.d, z29.d\n"
+      "uzp1 z18.d, z26.d, z29.d\n"
       "uzp2 z26.d, z26.d, z29.d\n"
       "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
-      "uzp1 z29.d, z27.d, z30.d\n"
+      "uzp1 z17.d, z27.d, z30.d\n"
       "uzp2 z27.d, z27.d, z30.d\n"
       "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
-      "uzp1 z30.d, z28.d, z31.d\n"
+      "uzp1 z16.d, z28.d, z31.d\n"
       "uzp2 z28.d, z28.d, z31.d\n"
-      "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
       "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
-      "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+      "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
       "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
       "st1w { z21.s }, p0, [%x[Cpanel]]\n"
       "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
-      "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+      "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+      "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
       "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
index 6d36bf8..60f1b69 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
index 1d502f5..69ddb21 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -52,33 +52,33 @@
     __asm__ __volatile__(
       "ptrue p0.b\n"
       "1:"  // Height loop
-      "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x24, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x25, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
       "cnth x23, ALL, MUL #2\n"
-      "add x22, x26, x20, LSL #1\n"
+      "add x22, x24, x20, LSL #1\n"
       "add x21, x22, x20, LSL #1\n"
       "add x20, x21, x20, LSL #1\n"
-      "cmp x25, x23\n"
+      "cmp x26, x23\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov %x[Apanel], x24\n"
+      "mov %x[Apanel], x25\n"
       "bgt 3f\n"
       "dech x23\n"
-      "cmp x25, x23\n"
-      "mov x21, x26\n"
+      "cmp x26, x23\n"
+      "mov x21, x24\n"
       "bgt 3f\n"
-      "mov x22, x26\n"
+      "mov x22, x24\n"
       "3:"  // B setup done
       "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
       "cmp x20, #0x2\n"
       "mov z8.b, #0x0\n"
       "mov z9.b, #0x0\n"
       "mov z10.b, #0x0\n"
-      "ld1h { z0.h }, p0/Z, [x26]\n"
+      "ld1h { z0.h }, p0/Z, [x24]\n"
       "mov z11.b, #0x0\n"
       "mov z12.b, #0x0\n"
       "ld1h { z1.h }, p0/Z, [x22]\n"
@@ -116,12 +116,12 @@
       "fmla z11.h, p0/M, z0.h, z4.h\n"
       "fmla z12.h, p0/M, z1.h, z4.h\n"
       "fmla z13.h, p0/M, z2.h, z4.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+      "ld1rh { z7.h }, p0/Z, [%x[Apanel], #10]\n"
       "fmla z14.h, p0/M, z0.h, z5.h\n"
       "fmla z15.h, p0/M, z1.h, z5.h\n"
       "cmp x20, #0x2\n"
       "fmla z16.h, p0/M, z2.h, z5.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #12]\n"
       "fmla z17.h, p0/M, z0.h, z6.h\n"
       "fmla z18.h, p0/M, z1.h, z6.h\n"
       "fmla z19.h, p0/M, z2.h, z6.h\n"
@@ -130,57 +130,57 @@
       "fmla z21.h, p0/M, z1.h, z3.h\n"
       "fmla z22.h, p0/M, z2.h, z3.h\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
-      "fmla z23.h, p0/M, z0.h, z4.h\n"
-      "fmla z24.h, p0/M, z1.h, z4.h\n"
-      "fmla z25.h, p0/M, z2.h, z4.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
-      "fmla z26.h, p0/M, z0.h, z5.h\n"
-      "fmla z27.h, p0/M, z1.h, z5.h\n"
-      "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n"
+      "fmla z23.h, p0/M, z0.h, z7.h\n"
+      "fmla z24.h, p0/M, z1.h, z7.h\n"
+      "fmla z25.h, p0/M, z2.h, z7.h\n"
+      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #18]\n"
+      "fmla z26.h, p0/M, z0.h, z4.h\n"
+      "fmla z27.h, p0/M, z1.h, z4.h\n"
+      "fmla z28.h, p0/M, z2.h, z4.h\n"
+      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #20]\n"
       "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "ld1h { z0.h }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n"
       "fmla z30.h, p0/M, z1.h, z6.h\n"
       "fmla z31.h, p0/M, z2.h, z6.h\n"
-      "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
       "ld1h { z2.h }, p0/Z, [x21, #1, MUL VL]\n"
-      "fmla z8.h, p0/M, z0.h, z3.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n"
-      "fmla z9.h, p0/M, z1.h, z3.h\n"
+      "fmla z8.h, p0/M, z7.h, z3.h\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
+      "fmla z9.h, p0/M, z6.h, z3.h\n"
       "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "fmla z11.h, p0/M, z0.h, z4.h\n"
+      "fmla z11.h, p0/M, z7.h, z5.h\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
-      "fmla z12.h, p0/M, z1.h, z4.h\n"
-      "fmla z13.h, p0/M, z2.h, z4.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
-      "fmla z14.h, p0/M, z0.h, z5.h\n"
-      "fmla z15.h, p0/M, z1.h, z5.h\n"
-      "addvl x26, x26, #2\n"
-      "fmla z16.h, p0/M, z2.h, z5.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z17.h, p0/M, z0.h, z6.h\n"
-      "fmla z18.h, p0/M, z1.h, z6.h\n"
-      "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n"
+      "fmla z12.h, p0/M, z6.h, z5.h\n"
+      "fmla z13.h, p0/M, z2.h, z5.h\n"
+      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #26]\n"
+      "fmla z14.h, p0/M, z7.h, z4.h\n"
+      "fmla z15.h, p0/M, z6.h, z4.h\n"
+      "addvl x24, x24, #2\n"
+      "fmla z16.h, p0/M, z2.h, z4.h\n"
+      "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z17.h, p0/M, z7.h, z1.h\n"
+      "fmla z18.h, p0/M, z6.h, z1.h\n"
+      "fmla z19.h, p0/M, z2.h, z1.h\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
       "addvl x22, x22, #2\n"
       "addvl x21, x21, #2\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
+      "fmla z20.h, p0/M, z7.h, z3.h\n"
+      "fmla z21.h, p0/M, z6.h, z3.h\n"
       "fmla z22.h, p0/M, z2.h, z3.h\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
-      "fmla z23.h, p0/M, z0.h, z4.h\n"
-      "fmla z24.h, p0/M, z1.h, z4.h\n"
-      "fmla z25.h, p0/M, z2.h, z4.h\n"
-      "fmla z26.h, p0/M, z0.h, z5.h\n"
+      "fmla z23.h, p0/M, z7.h, z5.h\n"
+      "fmla z24.h, p0/M, z6.h, z5.h\n"
+      "fmla z25.h, p0/M, z2.h, z5.h\n"
+      "fmla z26.h, p0/M, z7.h, z0.h\n"
       "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
-      "fmla z27.h, p0/M, z1.h, z5.h\n"
-      "fmla z28.h, p0/M, z2.h, z5.h\n"
+      "fmla z27.h, p0/M, z6.h, z0.h\n"
+      "fmla z28.h, p0/M, z2.h, z0.h\n"
       "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "ld1h { z0.h }, p0/Z, [x26]\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
+      "fmla z29.h, p0/M, z7.h, z1.h\n"
+      "ld1h { z0.h }, p0/Z, [x24]\n"
+      "fmla z30.h, p0/M, z6.h, z1.h\n"
+      "fmla z31.h, p0/M, z2.h, z1.h\n"
       "ld1h { z1.h }, p0/Z, [x22]\n"
       "ld1h { z2.h }, p0/Z, [x21]\n"
       "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
@@ -188,9 +188,9 @@
       "5:"  // main loop skip
       "fmla z8.h, p0/M, z0.h, z3.h\n"
       "fmla z9.h, p0/M, z1.h, z3.h\n"
-      "addvl x26, x26, #1\n"
+      "addvl x24, x24, #1\n"
       "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+      "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
       "fmla z11.h, p0/M, z0.h, z4.h\n"
       "fmla z12.h, p0/M, z1.h, z4.h\n"
       "fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -203,11 +203,11 @@
       "fmla z17.h, p0/M, z0.h, z6.h\n"
       "fmla z18.h, p0/M, z1.h, z6.h\n"
       "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
+      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
+      "fmla z20.h, p0/M, z0.h, z7.h\n"
+      "fmla z21.h, p0/M, z1.h, z7.h\n"
       "addvl x21, x21, #1\n"
-      "fmla z22.h, p0/M, z2.h, z3.h\n"
+      "fmla z22.h, p0/M, z2.h, z7.h\n"
       "fmla z23.h, p0/M, z0.h, z4.h\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
       "fmla z24.h, p0/M, z1.h, z4.h\n"
@@ -215,50 +215,50 @@
       "fmla z26.h, p0/M, z0.h, z5.h\n"
       "fmla z27.h, p0/M, z1.h, z5.h\n"
       "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
+      "fmla z29.h, p0/M, z0.h, z3.h\n"
+      "fmla z30.h, p0/M, z1.h, z3.h\n"
+      "fmla z31.h, p0/M, z2.h, z3.h\n"
       "cbz x20, 6f\n"
-      "ld1h { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z1.h }, p0/Z, [x22]\n"
-      "ld1h { z2.h }, p0/Z, [x21]\n"
+      "ld1h { z6.h }, p0/Z, [x24]\n"
+      "ld1h { z5.h }, p0/Z, [x22]\n"
+      "ld1h { z4.h }, p0/Z, [x21]\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
-      "fmla z8.h, p0/M, z0.h, z3.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
-      "fmla z9.h, p0/M, z1.h, z3.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
-      "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "fmla z11.h, p0/M, z0.h, z4.h\n"
-      "fmla z12.h, p0/M, z1.h, z4.h\n"
-      "fmla z13.h, p0/M, z2.h, z4.h\n"
+      "fmla z8.h, p0/M, z6.h, z3.h\n"
+      "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
+      "fmla z9.h, p0/M, z5.h, z3.h\n"
+      "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
+      "fmla z10.h, p0/M, z4.h, z3.h\n"
+      "fmla z11.h, p0/M, z6.h, z2.h\n"
+      "fmla z12.h, p0/M, z5.h, z2.h\n"
+      "fmla z13.h, p0/M, z4.h, z2.h\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
-      "fmla z14.h, p0/M, z0.h, z5.h\n"
-      "fmla z15.h, p0/M, z1.h, z5.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
-      "fmla z16.h, p0/M, z2.h, z5.h\n"
-      "fmla z17.h, p0/M, z0.h, z6.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
-      "fmla z18.h, p0/M, z1.h, z6.h\n"
-      "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
+      "fmla z14.h, p0/M, z6.h, z1.h\n"
+      "fmla z15.h, p0/M, z5.h, z1.h\n"
+      "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+      "fmla z16.h, p0/M, z4.h, z1.h\n"
+      "fmla z17.h, p0/M, z6.h, z0.h\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
+      "fmla z18.h, p0/M, z5.h, z0.h\n"
+      "fmla z19.h, p0/M, z4.h, z0.h\n"
+      "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
+      "fmla z20.h, p0/M, z6.h, z3.h\n"
+      "fmla z21.h, p0/M, z5.h, z3.h\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
-      "fmla z22.h, p0/M, z2.h, z3.h\n"
-      "fmla z23.h, p0/M, z0.h, z4.h\n"
-      "fmla z24.h, p0/M, z1.h, z4.h\n"
-      "fmla z25.h, p0/M, z2.h, z4.h\n"
-      "fmla z26.h, p0/M, z0.h, z5.h\n"
-      "fmla z27.h, p0/M, z1.h, z5.h\n"
-      "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
+      "fmla z22.h, p0/M, z4.h, z3.h\n"
+      "fmla z23.h, p0/M, z6.h, z2.h\n"
+      "fmla z24.h, p0/M, z5.h, z2.h\n"
+      "fmla z25.h, p0/M, z4.h, z2.h\n"
+      "fmla z26.h, p0/M, z6.h, z1.h\n"
+      "fmla z27.h, p0/M, z5.h, z1.h\n"
+      "fmla z28.h, p0/M, z4.h, z1.h\n"
+      "fmla z29.h, p0/M, z6.h, z0.h\n"
+      "fmla z30.h, p0/M, z5.h, z0.h\n"
+      "fmla z31.h, p0/M, z4.h, z0.h\n"
       "6:"  // multiply loop done
-      "dech x25, ALL, MUL #3\n"
+      "dech x26, ALL, MUL #3\n"
       "st1h { z8.h }, p0, [%x[Cpanel]]\n"
-      "cmp x25, XZR\n"
+      "cmp x26, XZR\n"
       "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
       "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
@@ -289,7 +289,7 @@
       "bne 1b\n"
       : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
       : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
-      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
index de219aa..23503fa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp
@@ -52,26 +52,26 @@
     __asm__ __volatile__(
       "ptrue p0.b\n"
       "1:"  // Height loop
-      "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x24, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x25, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
       "cnth x23, ALL, MUL #2\n"
-      "add x22, x26, x20, LSL #1\n"
+      "add x22, x24, x20, LSL #1\n"
       "add x21, x22, x20, LSL #1\n"
       "add x20, x21, x20, LSL #1\n"
-      "cmp x25, x23\n"
+      "cmp x26, x23\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov %x[Apanel], x24\n"
+      "mov %x[Apanel], x25\n"
       "bgt 3f\n"
       "dech x23\n"
-      "cmp x25, x23\n"
-      "mov x21, x26\n"
+      "cmp x26, x23\n"
+      "mov x21, x24\n"
       "bgt 3f\n"
-      "mov x22, x26\n"
+      "mov x22, x24\n"
       "3:"  // B setup done
       "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
       "cmp x20, #0x2\n"
@@ -81,7 +81,7 @@
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
       "mov z11.b, #0x0\n"
       "mov z12.b, #0x0\n"
-      "ld1h { z2.h }, p0/Z, [x26]\n"
+      "ld1h { z2.h }, p0/Z, [x24]\n"
       "mov z13.b, #0x0\n"
       "mov z14.b, #0x0\n"
       "ld1h { z3.h }, p0/Z, [x22]\n"
@@ -107,19 +107,19 @@
       "4:"  // main loop head
       "fmla z8.h, z2.h, z0.h[0]\n"
       "fmla z11.h, z2.h, z0.h[1]\n"
-      "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
       "fmla z14.h, z2.h, z0.h[2]\n"
       "fmla z17.h, z2.h, z0.h[3]\n"
-      "ld1h { z5.h }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1h { z6.h }, p0/Z, [x24, #1, MUL VL]\n"
       "fmla z20.h, z2.h, z0.h[4]\n"
       "fmla z23.h, z2.h, z0.h[5]\n"
-      "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
       "fmla z26.h, z2.h, z0.h[6]\n"
       "fmla z29.h, z2.h, z0.h[7]\n"
-      "ld1h { z7.h }, p0/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z1.h }, p0/Z, [x21, #1, MUL VL]\n"
       "fmla z9.h, z3.h, z0.h[0]\n"
       "fmla z12.h, z3.h, z0.h[1]\n"
-      "addvl x26, x26, #2\n"
+      "addvl x24, x24, #2\n"
       "fmla z15.h, z3.h, z0.h[2]\n"
       "fmla z18.h, z3.h, z0.h[3]\n"
       "addvl x22, x22, #2\n"
@@ -137,36 +137,36 @@
       "add %x[Apanel], %x[Apanel], #0x20\n"
       "fmla z22.h, z4.h, z0.h[4]\n"
       "fmla z25.h, z4.h, z0.h[5]\n"
-      "ld1h { z2.h }, p0/Z, [x26]\n"
+      "ld1h { z2.h }, p0/Z, [x24]\n"
       "fmla z28.h, z4.h, z0.h[6]\n"
       "fmla z31.h, z4.h, z0.h[7]\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      "fmla z8.h, z5.h, z1.h[0]\n"
-      "fmla z11.h, z5.h, z1.h[1]\n"
+      "fmla z8.h, z6.h, z7.h[0]\n"
+      "fmla z11.h, z6.h, z7.h[1]\n"
       "ld1h { z3.h }, p0/Z, [x22]\n"
-      "fmla z14.h, z5.h, z1.h[2]\n"
-      "fmla z17.h, z5.h, z1.h[3]\n"
+      "fmla z14.h, z6.h, z7.h[2]\n"
+      "fmla z17.h, z6.h, z7.h[3]\n"
       "ld1h { z4.h }, p0/Z, [x21]\n"
-      "fmla z20.h, z5.h, z1.h[4]\n"
-      "fmla z23.h, z5.h, z1.h[5]\n"
-      "fmla z26.h, z5.h, z1.h[6]\n"
-      "fmla z29.h, z5.h, z1.h[7]\n"
-      "fmla z9.h, z6.h, z1.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z15.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z1.h[3]\n"
-      "fmla z21.h, z6.h, z1.h[4]\n"
-      "fmla z24.h, z6.h, z1.h[5]\n"
-      "fmla z27.h, z6.h, z1.h[6]\n"
-      "fmla z30.h, z6.h, z1.h[7]\n"
-      "fmla z10.h, z7.h, z1.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z16.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z1.h[3]\n"
-      "fmla z22.h, z7.h, z1.h[4]\n"
-      "fmla z25.h, z7.h, z1.h[5]\n"
-      "fmla z28.h, z7.h, z1.h[6]\n"
-      "fmla z31.h, z7.h, z1.h[7]\n"
+      "fmla z20.h, z6.h, z7.h[4]\n"
+      "fmla z23.h, z6.h, z7.h[5]\n"
+      "fmla z26.h, z6.h, z7.h[6]\n"
+      "fmla z29.h, z6.h, z7.h[7]\n"
+      "fmla z9.h, z5.h, z7.h[0]\n"
+      "fmla z12.h, z5.h, z7.h[1]\n"
+      "fmla z15.h, z5.h, z7.h[2]\n"
+      "fmla z18.h, z5.h, z7.h[3]\n"
+      "fmla z21.h, z5.h, z7.h[4]\n"
+      "fmla z24.h, z5.h, z7.h[5]\n"
+      "fmla z27.h, z5.h, z7.h[6]\n"
+      "fmla z30.h, z5.h, z7.h[7]\n"
+      "fmla z10.h, z1.h, z7.h[0]\n"
+      "fmla z13.h, z1.h, z7.h[1]\n"
+      "fmla z16.h, z1.h, z7.h[2]\n"
+      "fmla z19.h, z1.h, z7.h[3]\n"
+      "fmla z22.h, z1.h, z7.h[4]\n"
+      "fmla z25.h, z1.h, z7.h[5]\n"
+      "fmla z28.h, z1.h, z7.h[6]\n"
+      "fmla z31.h, z1.h, z7.h[7]\n"
       "bge 4b\n"
       "5:"  // main loop skip
       "fmla z8.h, z2.h, z0.h[0]\n"
@@ -174,7 +174,7 @@
       "add %x[Apanel], %x[Apanel], #0x10\n"
       "fmla z14.h, z2.h, z0.h[2]\n"
       "fmla z17.h, z2.h, z0.h[3]\n"
-      "addvl x26, x26, #1\n"
+      "addvl x24, x24, #1\n"
       "fmla z20.h, z2.h, z0.h[4]\n"
       "fmla z23.h, z2.h, z0.h[5]\n"
       "addvl x22, x22, #1\n"
@@ -198,39 +198,39 @@
       "fmla z28.h, z4.h, z0.h[6]\n"
       "fmla z31.h, z4.h, z0.h[7]\n"
       "cbz x20, 6f\n"
-      "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      "ld1h { z5.h }, p0/Z, [x26]\n"
-      "fmla z8.h, z5.h, z0.h[0]\n"
-      "ld1h { z6.h }, p0/Z, [x22]\n"
-      "ld1h { z7.h }, p0/Z, [x21]\n"
-      "fmla z11.h, z5.h, z0.h[1]\n"
-      "fmla z14.h, z5.h, z0.h[2]\n"
-      "fmla z17.h, z5.h, z0.h[3]\n"
+      "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1h { z2.h }, p0/Z, [x24]\n"
+      "fmla z8.h, z2.h, z3.h[0]\n"
+      "ld1h { z1.h }, p0/Z, [x22]\n"
+      "ld1h { z0.h }, p0/Z, [x21]\n"
+      "fmla z11.h, z2.h, z3.h[1]\n"
+      "fmla z14.h, z2.h, z3.h[2]\n"
+      "fmla z17.h, z2.h, z3.h[3]\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
-      "fmla z20.h, z5.h, z0.h[4]\n"
-      "fmla z23.h, z5.h, z0.h[5]\n"
-      "fmla z26.h, z5.h, z0.h[6]\n"
-      "fmla z29.h, z5.h, z0.h[7]\n"
-      "fmla z9.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z0.h[1]\n"
-      "fmla z15.h, z6.h, z0.h[2]\n"
-      "fmla z18.h, z6.h, z0.h[3]\n"
-      "fmla z21.h, z6.h, z0.h[4]\n"
-      "fmla z24.h, z6.h, z0.h[5]\n"
-      "fmla z27.h, z6.h, z0.h[6]\n"
-      "fmla z30.h, z6.h, z0.h[7]\n"
-      "fmla z10.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z0.h[1]\n"
-      "fmla z16.h, z7.h, z0.h[2]\n"
-      "fmla z19.h, z7.h, z0.h[3]\n"
-      "fmla z22.h, z7.h, z0.h[4]\n"
-      "fmla z25.h, z7.h, z0.h[5]\n"
-      "fmla z28.h, z7.h, z0.h[6]\n"
-      "fmla z31.h, z7.h, z0.h[7]\n"
+      "fmla z20.h, z2.h, z3.h[4]\n"
+      "fmla z23.h, z2.h, z3.h[5]\n"
+      "fmla z26.h, z2.h, z3.h[6]\n"
+      "fmla z29.h, z2.h, z3.h[7]\n"
+      "fmla z9.h, z1.h, z3.h[0]\n"
+      "fmla z12.h, z1.h, z3.h[1]\n"
+      "fmla z15.h, z1.h, z3.h[2]\n"
+      "fmla z18.h, z1.h, z3.h[3]\n"
+      "fmla z21.h, z1.h, z3.h[4]\n"
+      "fmla z24.h, z1.h, z3.h[5]\n"
+      "fmla z27.h, z1.h, z3.h[6]\n"
+      "fmla z30.h, z1.h, z3.h[7]\n"
+      "fmla z10.h, z0.h, z3.h[0]\n"
+      "fmla z13.h, z0.h, z3.h[1]\n"
+      "fmla z16.h, z0.h, z3.h[2]\n"
+      "fmla z19.h, z0.h, z3.h[3]\n"
+      "fmla z22.h, z0.h, z3.h[4]\n"
+      "fmla z25.h, z0.h, z3.h[5]\n"
+      "fmla z28.h, z0.h, z3.h[6]\n"
+      "fmla z31.h, z0.h, z3.h[7]\n"
       "6:"  // multiply loop done
-      "dech x25, ALL, MUL #3\n"
+      "dech x26, ALL, MUL #3\n"
       "st1h { z8.h }, p0, [%x[Cpanel]]\n"
-      "cmp x25, XZR\n"
+      "cmp x26, XZR\n"
       "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
       "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
index aa3507e..ac69869 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
index 8c8b6b0..c65c3a3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -52,33 +52,33 @@
     __asm__ __volatile__(
       "ptrue p0.b\n"
       "1:"  // Height loop
-      "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x24, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x25, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
       "cntw x23, ALL, MUL #2\n"
-      "add x22, x26, x20, LSL #2\n"
+      "add x22, x24, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
       "add x20, x21, x20, LSL #2\n"
-      "cmp x25, x23\n"
+      "cmp x26, x23\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov %x[Apanel], x24\n"
+      "mov %x[Apanel], x25\n"
       "bgt 3f\n"
       "decw x23\n"
-      "cmp x25, x23\n"
-      "mov x21, x26\n"
+      "cmp x26, x23\n"
+      "mov x21, x24\n"
       "bgt 3f\n"
-      "mov x22, x26\n"
+      "mov x22, x24\n"
       "3:"  // B setup done
       "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
       "cmp x20, #0x2\n"
       "mov z8.b, #0x0\n"
       "mov z9.b, #0x0\n"
       "mov z10.b, #0x0\n"
-      "ld1w { z0.s }, p0/Z, [x26]\n"
+      "ld1w { z0.s }, p0/Z, [x24]\n"
       "mov z11.b, #0x0\n"
       "mov z12.b, #0x0\n"
       "ld1w { z1.s }, p0/Z, [x22]\n"
@@ -116,12 +116,12 @@
       "fmla z11.s, p0/M, z0.s, z4.s\n"
       "fmla z12.s, p0/M, z1.s, z4.s\n"
       "fmla z13.s, p0/M, z2.s, z4.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #20]\n"
       "fmla z14.s, p0/M, z0.s, z5.s\n"
       "fmla z15.s, p0/M, z1.s, z5.s\n"
       "cmp x20, #0x2\n"
       "fmla z16.s, p0/M, z2.s, z5.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #24]\n"
       "fmla z17.s, p0/M, z0.s, z6.s\n"
       "fmla z18.s, p0/M, z1.s, z6.s\n"
       "fmla z19.s, p0/M, z2.s, z6.s\n"
@@ -130,57 +130,57 @@
       "fmla z21.s, p0/M, z1.s, z3.s\n"
       "fmla z22.s, p0/M, z2.s, z3.s\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
-      "fmla z23.s, p0/M, z0.s, z4.s\n"
-      "fmla z24.s, p0/M, z1.s, z4.s\n"
-      "fmla z25.s, p0/M, z2.s, z4.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
-      "fmla z26.s, p0/M, z0.s, z5.s\n"
-      "fmla z27.s, p0/M, z1.s, z5.s\n"
-      "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
+      "fmla z23.s, p0/M, z0.s, z7.s\n"
+      "fmla z24.s, p0/M, z1.s, z7.s\n"
+      "fmla z25.s, p0/M, z2.s, z7.s\n"
+      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #36]\n"
+      "fmla z26.s, p0/M, z0.s, z4.s\n"
+      "fmla z27.s, p0/M, z1.s, z4.s\n"
+      "fmla z28.s, p0/M, z2.s, z4.s\n"
+      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #40]\n"
       "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "ld1w { z0.s }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1w { z7.s }, p0/Z, [x24, #1, MUL VL]\n"
       "fmla z30.s, p0/M, z1.s, z6.s\n"
       "fmla z31.s, p0/M, z2.s, z6.s\n"
-      "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z6.s }, p0/Z, [x22, #1, MUL VL]\n"
       "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
-      "fmla z8.s, p0/M, z0.s, z3.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
-      "fmla z9.s, p0/M, z1.s, z3.s\n"
+      "fmla z8.s, p0/M, z7.s, z3.s\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+      "fmla z9.s, p0/M, z6.s, z3.s\n"
       "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "fmla z11.s, p0/M, z0.s, z4.s\n"
+      "fmla z11.s, p0/M, z7.s, z5.s\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
-      "fmla z12.s, p0/M, z1.s, z4.s\n"
-      "fmla z13.s, p0/M, z2.s, z4.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
-      "fmla z14.s, p0/M, z0.s, z5.s\n"
-      "fmla z15.s, p0/M, z1.s, z5.s\n"
-      "addvl x26, x26, #2\n"
-      "fmla z16.s, p0/M, z2.s, z5.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
-      "fmla z17.s, p0/M, z0.s, z6.s\n"
-      "fmla z18.s, p0/M, z1.s, z6.s\n"
-      "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+      "fmla z12.s, p0/M, z6.s, z5.s\n"
+      "fmla z13.s, p0/M, z2.s, z5.s\n"
+      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #52]\n"
+      "fmla z14.s, p0/M, z7.s, z4.s\n"
+      "fmla z15.s, p0/M, z6.s, z4.s\n"
+      "addvl x24, x24, #2\n"
+      "fmla z16.s, p0/M, z2.s, z4.s\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+      "fmla z17.s, p0/M, z7.s, z1.s\n"
+      "fmla z18.s, p0/M, z6.s, z1.s\n"
+      "fmla z19.s, p0/M, z2.s, z1.s\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
       "addvl x22, x22, #2\n"
       "addvl x21, x21, #2\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
+      "fmla z20.s, p0/M, z7.s, z3.s\n"
+      "fmla z21.s, p0/M, z6.s, z3.s\n"
       "fmla z22.s, p0/M, z2.s, z3.s\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "fmla z23.s, p0/M, z0.s, z4.s\n"
-      "fmla z24.s, p0/M, z1.s, z4.s\n"
-      "fmla z25.s, p0/M, z2.s, z4.s\n"
-      "fmla z26.s, p0/M, z0.s, z5.s\n"
+      "fmla z23.s, p0/M, z7.s, z5.s\n"
+      "fmla z24.s, p0/M, z6.s, z5.s\n"
+      "fmla z25.s, p0/M, z2.s, z5.s\n"
+      "fmla z26.s, p0/M, z7.s, z0.s\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "fmla z27.s, p0/M, z1.s, z5.s\n"
-      "fmla z28.s, p0/M, z2.s, z5.s\n"
+      "fmla z27.s, p0/M, z6.s, z0.s\n"
+      "fmla z28.s, p0/M, z2.s, z0.s\n"
       "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "ld1w { z0.s }, p0/Z, [x26]\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
+      "fmla z29.s, p0/M, z7.s, z1.s\n"
+      "ld1w { z0.s }, p0/Z, [x24]\n"
+      "fmla z30.s, p0/M, z6.s, z1.s\n"
+      "fmla z31.s, p0/M, z2.s, z1.s\n"
       "ld1w { z1.s }, p0/Z, [x22]\n"
       "ld1w { z2.s }, p0/Z, [x21]\n"
       "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
@@ -188,9 +188,9 @@
       "5:"  // main loop skip
       "fmla z8.s, p0/M, z0.s, z3.s\n"
       "fmla z9.s, p0/M, z1.s, z3.s\n"
-      "addvl x26, x26, #1\n"
+      "addvl x24, x24, #1\n"
       "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "fmla z11.s, p0/M, z0.s, z4.s\n"
       "fmla z12.s, p0/M, z1.s, z4.s\n"
       "fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -203,11 +203,11 @@
       "fmla z17.s, p0/M, z0.s, z6.s\n"
       "fmla z18.s, p0/M, z1.s, z6.s\n"
       "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z20.s, p0/M, z0.s, z7.s\n"
+      "fmla z21.s, p0/M, z1.s, z7.s\n"
       "addvl x21, x21, #1\n"
-      "fmla z22.s, p0/M, z2.s, z3.s\n"
+      "fmla z22.s, p0/M, z2.s, z7.s\n"
       "fmla z23.s, p0/M, z0.s, z4.s\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
       "fmla z24.s, p0/M, z1.s, z4.s\n"
@@ -215,50 +215,50 @@
       "fmla z26.s, p0/M, z0.s, z5.s\n"
       "fmla z27.s, p0/M, z1.s, z5.s\n"
       "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
+      "fmla z29.s, p0/M, z0.s, z3.s\n"
+      "fmla z30.s, p0/M, z1.s, z3.s\n"
+      "fmla z31.s, p0/M, z2.s, z3.s\n"
       "cbz x20, 6f\n"
-      "ld1w { z0.s }, p0/Z, [x26]\n"
-      "ld1w { z1.s }, p0/Z, [x22]\n"
-      "ld1w { z2.s }, p0/Z, [x21]\n"
+      "ld1w { z6.s }, p0/Z, [x24]\n"
+      "ld1w { z5.s }, p0/Z, [x22]\n"
+      "ld1w { z4.s }, p0/Z, [x21]\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "fmla z8.s, p0/M, z0.s, z3.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
-      "fmla z9.s, p0/M, z1.s, z3.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
-      "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "fmla z11.s, p0/M, z0.s, z4.s\n"
-      "fmla z12.s, p0/M, z1.s, z4.s\n"
-      "fmla z13.s, p0/M, z2.s, z4.s\n"
+      "fmla z8.s, p0/M, z6.s, z3.s\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+      "fmla z9.s, p0/M, z5.s, z3.s\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+      "fmla z10.s, p0/M, z4.s, z3.s\n"
+      "fmla z11.s, p0/M, z6.s, z2.s\n"
+      "fmla z12.s, p0/M, z5.s, z2.s\n"
+      "fmla z13.s, p0/M, z4.s, z2.s\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
-      "fmla z14.s, p0/M, z0.s, z5.s\n"
-      "fmla z15.s, p0/M, z1.s, z5.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
-      "fmla z16.s, p0/M, z2.s, z5.s\n"
-      "fmla z17.s, p0/M, z0.s, z6.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
-      "fmla z18.s, p0/M, z1.s, z6.s\n"
-      "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
+      "fmla z14.s, p0/M, z6.s, z1.s\n"
+      "fmla z15.s, p0/M, z5.s, z1.s\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+      "fmla z16.s, p0/M, z4.s, z1.s\n"
+      "fmla z17.s, p0/M, z6.s, z0.s\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+      "fmla z18.s, p0/M, z5.s, z0.s\n"
+      "fmla z19.s, p0/M, z4.s, z0.s\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z20.s, p0/M, z6.s, z3.s\n"
+      "fmla z21.s, p0/M, z5.s, z3.s\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "fmla z22.s, p0/M, z2.s, z3.s\n"
-      "fmla z23.s, p0/M, z0.s, z4.s\n"
-      "fmla z24.s, p0/M, z1.s, z4.s\n"
-      "fmla z25.s, p0/M, z2.s, z4.s\n"
-      "fmla z26.s, p0/M, z0.s, z5.s\n"
-      "fmla z27.s, p0/M, z1.s, z5.s\n"
-      "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
+      "fmla z22.s, p0/M, z4.s, z3.s\n"
+      "fmla z23.s, p0/M, z6.s, z2.s\n"
+      "fmla z24.s, p0/M, z5.s, z2.s\n"
+      "fmla z25.s, p0/M, z4.s, z2.s\n"
+      "fmla z26.s, p0/M, z6.s, z1.s\n"
+      "fmla z27.s, p0/M, z5.s, z1.s\n"
+      "fmla z28.s, p0/M, z4.s, z1.s\n"
+      "fmla z29.s, p0/M, z6.s, z0.s\n"
+      "fmla z30.s, p0/M, z5.s, z0.s\n"
+      "fmla z31.s, p0/M, z4.s, z0.s\n"
       "6:"  // multiply loop done
-      "decw x25, ALL, MUL #3\n"
+      "decw x26, ALL, MUL #3\n"
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
-      "cmp x25, XZR\n"
+      "cmp x26, XZR\n"
       "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
       "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
@@ -289,7 +289,7 @@
       "bne 1b\n"
       : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
       : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr))
-      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
index 4a0b31d..4b20be6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp
@@ -52,26 +52,26 @@
     __asm__ __volatile__(
       "ptrue p0.b\n"
       "1:"  // Height loop
-      "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
-      "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
-      "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov x24, %x[Apanel]\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+      "ldr x26, [%x[args_ptr], %[offsetof_N]]\n"
+      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "mov x25, %x[Apanel]\n"
       "2:"  // Width loop
-      "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
       "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
       "cntw x23, ALL, MUL #2\n"
-      "add x22, x26, x20, LSL #2\n"
+      "add x22, x24, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
       "add x20, x21, x20, LSL #2\n"
-      "cmp x25, x23\n"
+      "cmp x26, x23\n"
       "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
-      "mov %x[Apanel], x24\n"
+      "mov %x[Apanel], x25\n"
       "bgt 3f\n"
       "decw x23\n"
-      "cmp x25, x23\n"
-      "mov x21, x26\n"
+      "cmp x26, x23\n"
+      "mov x21, x24\n"
       "bgt 3f\n"
-      "mov x22, x26\n"
+      "mov x22, x24\n"
       "3:"  // B setup done
       "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
       "cmp x20, #0x2\n"
@@ -84,7 +84,7 @@
       "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
       "mov z13.b, #0x0\n"
       "mov z14.b, #0x0\n"
-      "ld1w { z4.s }, p0/Z, [x26]\n"
+      "ld1w { z4.s }, p0/Z, [x24]\n"
       "mov z15.b, #0x0\n"
       "mov z16.b, #0x0\n"
       "ld1w { z5.s }, p0/Z, [x22]\n"
@@ -108,19 +108,19 @@
       "4:"  // main loop head
       "fmla z8.s, z4.s, z0.s[0]\n"
       "fmla z11.s, z4.s, z0.s[1]\n"
-      "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
       "fmla z14.s, z4.s, z0.s[2]\n"
       "fmla z17.s, z4.s, z0.s[3]\n"
-      "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
       "fmla z20.s, z4.s, z1.s[0]\n"
       "fmla z23.s, z4.s, z1.s[1]\n"
       "sub x20, x20, #0x2\n"
       "fmla z26.s, z4.s, z1.s[2]\n"
       "fmla z29.s, z4.s, z1.s[3]\n"
-      "ld1w { z4.s }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1w { z4.s }, p0/Z, [x24, #1, MUL VL]\n"
       "fmla z9.s, z5.s, z0.s[0]\n"
       "fmla z12.s, z5.s, z0.s[1]\n"
-      "addvl x26, x26, #2\n"
+      "addvl x24, x24, #2\n"
       "fmla z15.s, z5.s, z0.s[2]\n"
       "fmla z18.s, z5.s, z0.s[3]\n"
       "cmp x20, #0x2\n"
@@ -140,35 +140,35 @@
       "fmla z25.s, z6.s, z1.s[1]\n"
       "fmla z28.s, z6.s, z1.s[2]\n"
       "fmla z31.s, z6.s, z1.s[3]\n"
-      "ld1w { z6.s }, p0/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n"
       "addvl x21, x21, #2\n"
-      "fmla z8.s, z4.s, z2.s[0]\n"
-      "fmla z11.s, z4.s, z2.s[1]\n"
-      "fmla z14.s, z4.s, z2.s[2]\n"
-      "fmla z17.s, z4.s, z2.s[3]\n"
+      "fmla z8.s, z4.s, z3.s[0]\n"
+      "fmla z11.s, z4.s, z3.s[1]\n"
+      "fmla z14.s, z4.s, z3.s[2]\n"
+      "fmla z17.s, z4.s, z3.s[3]\n"
       "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
-      "fmla z20.s, z4.s, z3.s[0]\n"
-      "fmla z23.s, z4.s, z3.s[1]\n"
-      "fmla z26.s, z4.s, z3.s[2]\n"
-      "fmla z29.s, z4.s, z3.s[3]\n"
-      "ld1w { z4.s }, p0/Z, [x26]\n"
-      "fmla z9.s, z5.s, z2.s[0]\n"
-      "fmla z12.s, z5.s, z2.s[1]\n"
-      "fmla z15.s, z5.s, z2.s[2]\n"
-      "fmla z18.s, z5.s, z2.s[3]\n"
-      "fmla z21.s, z5.s, z3.s[0]\n"
-      "fmla z24.s, z5.s, z3.s[1]\n"
-      "fmla z27.s, z5.s, z3.s[2]\n"
-      "fmla z30.s, z5.s, z3.s[3]\n"
+      "fmla z20.s, z4.s, z7.s[0]\n"
+      "fmla z23.s, z4.s, z7.s[1]\n"
+      "fmla z26.s, z4.s, z7.s[2]\n"
+      "fmla z29.s, z4.s, z7.s[3]\n"
+      "ld1w { z4.s }, p0/Z, [x24]\n"
+      "fmla z9.s, z5.s, z3.s[0]\n"
+      "fmla z12.s, z5.s, z3.s[1]\n"
+      "fmla z15.s, z5.s, z3.s[2]\n"
+      "fmla z18.s, z5.s, z3.s[3]\n"
+      "fmla z21.s, z5.s, z7.s[0]\n"
+      "fmla z24.s, z5.s, z7.s[1]\n"
+      "fmla z27.s, z5.s, z7.s[2]\n"
+      "fmla z30.s, z5.s, z7.s[3]\n"
       "ld1w { z5.s }, p0/Z, [x22]\n"
-      "fmla z10.s, z6.s, z2.s[0]\n"
-      "fmla z13.s, z6.s, z2.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z19.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z25.s, z6.s, z3.s[1]\n"
-      "fmla z28.s, z6.s, z3.s[2]\n"
-      "fmla z31.s, z6.s, z3.s[3]\n"
+      "fmla z10.s, z2.s, z3.s[0]\n"
+      "fmla z13.s, z2.s, z3.s[1]\n"
+      "fmla z16.s, z2.s, z3.s[2]\n"
+      "fmla z19.s, z2.s, z3.s[3]\n"
+      "fmla z22.s, z2.s, z7.s[0]\n"
+      "fmla z25.s, z2.s, z7.s[1]\n"
+      "fmla z28.s, z2.s, z7.s[2]\n"
+      "fmla z31.s, z2.s, z7.s[3]\n"
       "ld1w { z6.s }, p0/Z, [x21]\n"
       "bge 4b\n"
       "5:"  // main loop skip
@@ -177,7 +177,7 @@
       "add %x[Apanel], %x[Apanel], #0x20\n"
       "fmla z14.s, z4.s, z0.s[2]\n"
       "fmla z17.s, z4.s, z0.s[3]\n"
-      "addvl x26, x26, #1\n"
+      "addvl x24, x24, #1\n"
       "fmla z20.s, z4.s, z1.s[0]\n"
       "fmla z23.s, z4.s, z1.s[1]\n"
       "addvl x22, x22, #1\n"
@@ -201,40 +201,40 @@
       "fmla z28.s, z6.s, z1.s[2]\n"
       "fmla z31.s, z6.s, z1.s[3]\n"
       "cbz x20, 6f\n"
-      "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
-      "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ld1w { z7.s }, p0/Z, [x26]\n"
-      "ld1w { z4.s }, p0/Z, [x22]\n"
-      "fmla z8.s, z7.s, z0.s[0]\n"
-      "ld1w { z5.s }, p0/Z, [x21]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z14.s, z7.s, z0.s[2]\n"
-      "fmla z17.s, z7.s, z0.s[3]\n"
-      "fmla z20.s, z7.s, z1.s[0]\n"
-      "fmla z23.s, z7.s, z1.s[1]\n"
-      "fmla z26.s, z7.s, z1.s[2]\n"
-      "fmla z29.s, z7.s, z1.s[3]\n"
-      "fmla z9.s, z4.s, z0.s[0]\n"
-      "fmla z12.s, z4.s, z0.s[1]\n"
-      "fmla z15.s, z4.s, z0.s[2]\n"
-      "fmla z18.s, z4.s, z0.s[3]\n"
-      "fmla z21.s, z4.s, z1.s[0]\n"
-      "fmla z24.s, z4.s, z1.s[1]\n"
-      "fmla z27.s, z4.s, z1.s[2]\n"
-      "fmla z30.s, z4.s, z1.s[3]\n"
-      "fmla z10.s, z5.s, z0.s[0]\n"
-      "fmla z13.s, z5.s, z0.s[1]\n"
-      "fmla z16.s, z5.s, z0.s[2]\n"
-      "fmla z19.s, z5.s, z0.s[3]\n"
-      "fmla z22.s, z5.s, z1.s[0]\n"
-      "fmla z25.s, z5.s, z1.s[1]\n"
-      "fmla z28.s, z5.s, z1.s[2]\n"
-      "fmla z31.s, z5.s, z1.s[3]\n"
+      "ld1w { z2.s }, p0/Z, [x24]\n"
+      "ld1w { z1.s }, p0/Z, [x22]\n"
+      "fmla z8.s, z2.s, z4.s[0]\n"
+      "ld1w { z0.s }, p0/Z, [x21]\n"
+      "fmla z11.s, z2.s, z4.s[1]\n"
+      "fmla z14.s, z2.s, z4.s[2]\n"
+      "fmla z17.s, z2.s, z4.s[3]\n"
+      "fmla z20.s, z2.s, z3.s[0]\n"
+      "fmla z23.s, z2.s, z3.s[1]\n"
+      "fmla z26.s, z2.s, z3.s[2]\n"
+      "fmla z29.s, z2.s, z3.s[3]\n"
+      "fmla z9.s, z1.s, z4.s[0]\n"
+      "fmla z12.s, z1.s, z4.s[1]\n"
+      "fmla z15.s, z1.s, z4.s[2]\n"
+      "fmla z18.s, z1.s, z4.s[3]\n"
+      "fmla z21.s, z1.s, z3.s[0]\n"
+      "fmla z24.s, z1.s, z3.s[1]\n"
+      "fmla z27.s, z1.s, z3.s[2]\n"
+      "fmla z30.s, z1.s, z3.s[3]\n"
+      "fmla z10.s, z0.s, z4.s[0]\n"
+      "fmla z13.s, z0.s, z4.s[1]\n"
+      "fmla z16.s, z0.s, z4.s[2]\n"
+      "fmla z19.s, z0.s, z4.s[3]\n"
+      "fmla z22.s, z0.s, z3.s[0]\n"
+      "fmla z25.s, z0.s, z3.s[1]\n"
+      "fmla z28.s, z0.s, z3.s[2]\n"
+      "fmla z31.s, z0.s, z3.s[3]\n"
       "6:"  // multiply loop done
-      "decw x25, ALL, MUL #3\n"
+      "decw x26, ALL, MUL #3\n"
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
-      "cmp x25, XZR\n"
+      "cmp x26, XZR\n"
       "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
       "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index 6677c23..49ccce3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, bfloat16>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -100,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index f0b00e6..176f6e0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -140,11 +140,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -157,87 +157,87 @@
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10]\n"
+      ".inst 0x64604208  // bfdot z8.s, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64604209  // bfdot z9.s, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460420a  // bfdot z10.s, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6460420b  // bfdot z11.s, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x64684208  // bfdot z8.s, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x64684209  // bfdot z9.s, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6468420a  // bfdot z10.s, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x6468420b  // bfdot z11.s, z16.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x64704228  // bfdot z8.s, z17.h, z0.h[2]\n"
+      ".inst 0x64704209  // bfdot z9.s, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6470422a  // bfdot z10.s, z17.h, z0.h[2]\n"
+      ".inst 0x6470420b  // bfdot z11.s, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x64784228  // bfdot z8.s, z17.h, z0.h[3]\n"
+      ".inst 0x64784209  // bfdot z9.s, z16.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x6478422a  // bfdot z10.s, z17.h, z0.h[3]\n"
+      ".inst 0x6478420b  // bfdot z11.s, z16.h, z0.h[3]\n"
       "add x26, x26, #0x10\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10]\n"
+      ".inst 0x64604208  // bfdot z8.s, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64604209  // bfdot z9.s, z16.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6460422a  // bfdot z10.s, z17.h, z0.h[0]\n"
+      ".inst 0x6460420b  // bfdot z11.s, z16.h, z0.h[0]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64684228  // bfdot z8.s, z17.h, z0.h[1]\n"
+      ".inst 0x64684209  // bfdot z9.s, z16.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x6468422a  // bfdot z10.s, z17.h, z0.h[1]\n"
+      ".inst 0x6468420b  // bfdot z11.s, z16.h, z0.h[1]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64704228  // bfdot z8.s, z17.h, z0.h[2]\n"
+      ".inst 0x64704209  // bfdot z9.s, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x6470422a  // bfdot z10.s, z17.h, z0.h[2]\n"
+      ".inst 0x6470420b  // bfdot z11.s, z16.h, z0.h[2]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64784228  // bfdot z8.s, z17.h, z0.h[3]\n"
+      ".inst 0x64784209  // bfdot z9.s, z16.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6478422a  // bfdot z10.s, z17.h, z0.h[3]\n"
+      ".inst 0x6478420b  // bfdot z11.s, z16.h, z0.h[3]\n"
       "addvl x10, x10, #4\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -246,17 +246,17 @@
       "bne 6b\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "12:"  // Height 1: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -296,15 +296,15 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 18f\n"
       "17:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -320,12 +320,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -333,130 +333,130 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "21:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z1.h }, p0/Z, [x26]\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64614228  // bfdot z8.s, z17.h, z1.h[0]\n"
+      ".inst 0x6460422c  // bfdot z12.s, z17.h, z0.h[0]\n"
+      ".inst 0x64614209  // bfdot z9.s, z16.h, z1.h[0]\n"
+      ".inst 0x6460420d  // bfdot z13.s, z16.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461422a  // bfdot z10.s, z17.h, z1.h[0]\n"
+      ".inst 0x6460422e  // bfdot z14.s, z17.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461420b  // bfdot z11.s, z16.h, z1.h[0]\n"
+      ".inst 0x6460420f  // bfdot z15.s, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x64694228  // bfdot z8.s, z17.h, z1.h[1]\n"
+      ".inst 0x6468422c  // bfdot z12.s, z17.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x64694209  // bfdot z9.s, z16.h, z1.h[1]\n"
+      ".inst 0x6468420d  // bfdot z13.s, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x6469422a  // bfdot z10.s, z17.h, z1.h[1]\n"
+      ".inst 0x6468422e  // bfdot z14.s, z17.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x6469420b  // bfdot z11.s, z16.h, z1.h[1]\n"
+      ".inst 0x6468420f  // bfdot z15.s, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x64714228  // bfdot z8.s, z17.h, z1.h[2]\n"
+      ".inst 0x6470422c  // bfdot z12.s, z17.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x64714209  // bfdot z9.s, z16.h, z1.h[2]\n"
+      ".inst 0x6470420d  // bfdot z13.s, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6471422a  // bfdot z10.s, z17.h, z1.h[2]\n"
+      ".inst 0x6470422e  // bfdot z14.s, z17.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6471420b  // bfdot z11.s, z16.h, z1.h[2]\n"
+      ".inst 0x6470420f  // bfdot z15.s, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x64794228  // bfdot z8.s, z17.h, z1.h[3]\n"
+      ".inst 0x6478422c  // bfdot z12.s, z17.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x64794209  // bfdot z9.s, z16.h, z1.h[3]\n"
+      ".inst 0x6478420d  // bfdot z13.s, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x6479422a  // bfdot z10.s, z17.h, z1.h[3]\n"
+      ".inst 0x6478422e  // bfdot z14.s, z17.h, z0.h[3]\n"
+      ".inst 0x6479420b  // bfdot z11.s, z16.h, z1.h[3]\n"
+      ".inst 0x6478420f  // bfdot z15.s, z16.h, z0.h[3]\n"
       "bgt 22b\n"
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "subs x27, x27, #0x2\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64604228  // bfdot z8.s, z17.h, z0.h[0]\n"
+      ".inst 0x6461422c  // bfdot z12.s, z17.h, z1.h[0]\n"
+      ".inst 0x64604209  // bfdot z9.s, z16.h, z0.h[0]\n"
+      ".inst 0x6461420d  // bfdot z13.s, z16.h, z1.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6460422a  // bfdot z10.s, z17.h, z0.h[0]\n"
+      ".inst 0x6461422e  // bfdot z14.s, z17.h, z1.h[0]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x6460420b  // bfdot z11.s, z16.h, z0.h[0]\n"
+      ".inst 0x6461420f  // bfdot z15.s, z16.h, z1.h[0]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64684228  // bfdot z8.s, z17.h, z0.h[1]\n"
+      ".inst 0x6469422c  // bfdot z12.s, z17.h, z1.h[1]\n"
+      ".inst 0x64684209  // bfdot z9.s, z16.h, z0.h[1]\n"
+      ".inst 0x6469420d  // bfdot z13.s, z16.h, z1.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x6468422a  // bfdot z10.s, z17.h, z0.h[1]\n"
+      ".inst 0x6469422e  // bfdot z14.s, z17.h, z1.h[1]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x6468420b  // bfdot z11.s, z16.h, z0.h[1]\n"
+      ".inst 0x6469420f  // bfdot z15.s, z16.h, z1.h[1]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64704228  // bfdot z8.s, z17.h, z0.h[2]\n"
+      ".inst 0x6471422c  // bfdot z12.s, z17.h, z1.h[2]\n"
+      ".inst 0x64704209  // bfdot z9.s, z16.h, z0.h[2]\n"
+      ".inst 0x6471420d  // bfdot z13.s, z16.h, z1.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x6470422a  // bfdot z10.s, z17.h, z0.h[2]\n"
+      ".inst 0x6471422e  // bfdot z14.s, z17.h, z1.h[2]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x6470420b  // bfdot z11.s, z16.h, z0.h[2]\n"
+      ".inst 0x6471420f  // bfdot z15.s, z16.h, z1.h[2]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64784228  // bfdot z8.s, z17.h, z0.h[3]\n"
+      ".inst 0x6479422c  // bfdot z12.s, z17.h, z1.h[3]\n"
+      ".inst 0x64784209  // bfdot z9.s, z16.h, z0.h[3]\n"
+      ".inst 0x6479420d  // bfdot z13.s, z16.h, z1.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6478422a  // bfdot z10.s, z17.h, z0.h[3]\n"
+      ".inst 0x6479422e  // bfdot z14.s, z17.h, z1.h[3]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x6478420b  // bfdot z11.s, z16.h, z0.h[3]\n"
+      ".inst 0x6479420f  // bfdot z15.s, z16.h, z1.h[3]\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -466,25 +466,25 @@
       "add x25, x9, x20, LSL #2\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmin z12.s, p5/M, z12.s, z17.s\n"
+      "fmin z13.s, p5/M, z13.s, z17.s\n"
+      "fmin z14.s, p5/M, z14.s, z17.s\n"
+      "fmin z15.s, p5/M, z15.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
+      "fmax z12.s, p5/M, z12.s, z16.s\n"
+      "fmax z13.s, p5/M, z13.s, z16.s\n"
+      "fmax z14.s, p5/M, z14.s, z16.s\n"
+      "fmax z15.s, p5/M, z15.s, z16.s\n"
       "25:"  // Height 2: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -532,20 +532,20 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20]\n"
+      "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 31f\n"
       "30:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -565,13 +565,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -580,86 +580,86 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "34:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1rqh { z0.h }, p0/Z, [x24]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      ".inst 0x646242a8  // bfdot z8.s, z21.h, z2.h[0]\n"
+      ".inst 0x646142ac  // bfdot z12.s, z21.h, z1.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646042b0  // bfdot z16.s, z21.h, z0.h[0]\n"
+      ".inst 0x64624289  // bfdot z9.s, z20.h, z2.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6461428d  // bfdot z13.s, z20.h, z1.h[0]\n"
+      ".inst 0x64604291  // bfdot z17.s, z20.h, z0.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646242aa  // bfdot z10.s, z21.h, z2.h[0]\n"
+      ".inst 0x646142ae  // bfdot z14.s, z21.h, z1.h[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x646042b2  // bfdot z18.s, z21.h, z0.h[0]\n"
+      ".inst 0x6462428b  // bfdot z11.s, z20.h, z2.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6461428f  // bfdot z15.s, z20.h, z1.h[0]\n"
+      ".inst 0x64604293  // bfdot z19.s, z20.h, z0.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x646a42a8  // bfdot z8.s, z21.h, z2.h[1]\n"
+      ".inst 0x646942ac  // bfdot z12.s, z21.h, z1.h[1]\n"
+      ".inst 0x646842b0  // bfdot z16.s, z21.h, z0.h[1]\n"
+      ".inst 0x646a4289  // bfdot z9.s, z20.h, z2.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6469428d  // bfdot z13.s, z20.h, z1.h[1]\n"
+      ".inst 0x64684291  // bfdot z17.s, z20.h, z0.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x646a42aa  // bfdot z10.s, z21.h, z2.h[1]\n"
+      ".inst 0x646942ae  // bfdot z14.s, z21.h, z1.h[1]\n"
+      ".inst 0x646842b2  // bfdot z18.s, z21.h, z0.h[1]\n"
+      ".inst 0x646a428b  // bfdot z11.s, z20.h, z2.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x6469428f  // bfdot z15.s, z20.h, z1.h[1]\n"
+      ".inst 0x64684293  // bfdot z19.s, z20.h, z0.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x647242a8  // bfdot z8.s, z21.h, z2.h[2]\n"
+      ".inst 0x647142ac  // bfdot z12.s, z21.h, z1.h[2]\n"
+      ".inst 0x647042b0  // bfdot z16.s, z21.h, z0.h[2]\n"
+      ".inst 0x64724289  // bfdot z9.s, z20.h, z2.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x6471428d  // bfdot z13.s, z20.h, z1.h[2]\n"
+      ".inst 0x64704291  // bfdot z17.s, z20.h, z0.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x647242aa  // bfdot z10.s, z21.h, z2.h[2]\n"
+      ".inst 0x647142ae  // bfdot z14.s, z21.h, z1.h[2]\n"
+      ".inst 0x647042b2  // bfdot z18.s, z21.h, z0.h[2]\n"
+      ".inst 0x6472428b  // bfdot z11.s, z20.h, z2.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6471428f  // bfdot z15.s, z20.h, z1.h[2]\n"
+      ".inst 0x64704293  // bfdot z19.s, z20.h, z0.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x647a42a8  // bfdot z8.s, z21.h, z2.h[3]\n"
+      ".inst 0x647942ac  // bfdot z12.s, z21.h, z1.h[3]\n"
+      ".inst 0x647842b0  // bfdot z16.s, z21.h, z0.h[3]\n"
+      ".inst 0x647a4289  // bfdot z9.s, z20.h, z2.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x6479428d  // bfdot z13.s, z20.h, z1.h[3]\n"
+      ".inst 0x64784291  // bfdot z17.s, z20.h, z0.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x647a42aa  // bfdot z10.s, z21.h, z2.h[3]\n"
+      ".inst 0x647942ae  // bfdot z14.s, z21.h, z1.h[3]\n"
+      ".inst 0x647842b2  // bfdot z18.s, z21.h, z0.h[3]\n"
+      ".inst 0x647a428b  // bfdot z11.s, z20.h, z2.h[3]\n"
+      ".inst 0x6479428f  // bfdot z15.s, z20.h, z1.h[3]\n"
+      ".inst 0x64784293  // bfdot z19.s, z20.h, z0.h[3]\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -667,79 +667,79 @@
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "subs x27, x27, #0x2\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      ".inst 0x646042a8  // bfdot z8.s, z21.h, z0.h[0]\n"
+      ".inst 0x646142ac  // bfdot z12.s, z21.h, z1.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646242b0  // bfdot z16.s, z21.h, z2.h[0]\n"
+      ".inst 0x64604289  // bfdot z9.s, z20.h, z0.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6461428d  // bfdot z13.s, z20.h, z1.h[0]\n"
+      ".inst 0x64624291  // bfdot z17.s, z20.h, z2.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646042aa  // bfdot z10.s, z21.h, z0.h[0]\n"
+      ".inst 0x646142ae  // bfdot z14.s, z21.h, z1.h[0]\n"
+      ".inst 0x646242b2  // bfdot z18.s, z21.h, z2.h[0]\n"
+      ".inst 0x6460428b  // bfdot z11.s, z20.h, z0.h[0]\n"
+      ".inst 0x6461428f  // bfdot z15.s, z20.h, z1.h[0]\n"
+      ".inst 0x64624293  // bfdot z19.s, z20.h, z2.h[0]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646842a8  // bfdot z8.s, z21.h, z0.h[1]\n"
+      ".inst 0x646942ac  // bfdot z12.s, z21.h, z1.h[1]\n"
+      ".inst 0x646a42b0  // bfdot z16.s, z21.h, z2.h[1]\n"
+      ".inst 0x64684289  // bfdot z9.s, z20.h, z0.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6469428d  // bfdot z13.s, z20.h, z1.h[1]\n"
+      ".inst 0x646a4291  // bfdot z17.s, z20.h, z2.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646842aa  // bfdot z10.s, z21.h, z0.h[1]\n"
+      ".inst 0x646942ae  // bfdot z14.s, z21.h, z1.h[1]\n"
+      ".inst 0x646a42b2  // bfdot z18.s, z21.h, z2.h[1]\n"
+      ".inst 0x6468428b  // bfdot z11.s, z20.h, z0.h[1]\n"
+      ".inst 0x6469428f  // bfdot z15.s, z20.h, z1.h[1]\n"
+      ".inst 0x646a4293  // bfdot z19.s, z20.h, z2.h[1]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x647042a8  // bfdot z8.s, z21.h, z0.h[2]\n"
+      ".inst 0x647142ac  // bfdot z12.s, z21.h, z1.h[2]\n"
+      ".inst 0x647242b0  // bfdot z16.s, z21.h, z2.h[2]\n"
+      ".inst 0x64704289  // bfdot z9.s, z20.h, z0.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471428d  // bfdot z13.s, z20.h, z1.h[2]\n"
+      ".inst 0x64724291  // bfdot z17.s, z20.h, z2.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647042aa  // bfdot z10.s, z21.h, z0.h[2]\n"
+      ".inst 0x647142ae  // bfdot z14.s, z21.h, z1.h[2]\n"
+      ".inst 0x647242b2  // bfdot z18.s, z21.h, z2.h[2]\n"
+      ".inst 0x6470428b  // bfdot z11.s, z20.h, z0.h[2]\n"
+      ".inst 0x6471428f  // bfdot z15.s, z20.h, z1.h[2]\n"
+      ".inst 0x64724293  // bfdot z19.s, z20.h, z2.h[2]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x647842a8  // bfdot z8.s, z21.h, z0.h[3]\n"
+      ".inst 0x647942ac  // bfdot z12.s, z21.h, z1.h[3]\n"
+      ".inst 0x647a42b0  // bfdot z16.s, z21.h, z2.h[3]\n"
+      ".inst 0x64784289  // bfdot z9.s, z20.h, z0.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6479428d  // bfdot z13.s, z20.h, z1.h[3]\n"
+      ".inst 0x647a4291  // bfdot z17.s, z20.h, z2.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647842aa  // bfdot z10.s, z21.h, z0.h[3]\n"
+      ".inst 0x647942ae  // bfdot z14.s, z21.h, z1.h[3]\n"
+      ".inst 0x647a42b2  // bfdot z18.s, z21.h, z2.h[3]\n"
+      ".inst 0x6478428b  // bfdot z11.s, z20.h, z0.h[3]\n"
+      ".inst 0x6479428f  // bfdot z15.s, z20.h, z1.h[3]\n"
+      ".inst 0x647a4293  // bfdot z19.s, z20.h, z2.h[3]\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -750,33 +750,33 @@
       "add x24, x25, x20, LSL #2\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z21.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z20.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z21.s\n"
+      "fmin z9.s, p5/M, z9.s, z21.s\n"
+      "fmin z10.s, p5/M, z10.s, z21.s\n"
+      "fmin z11.s, p5/M, z11.s, z21.s\n"
+      "fmin z12.s, p5/M, z12.s, z21.s\n"
+      "fmin z13.s, p5/M, z13.s, z21.s\n"
+      "fmin z14.s, p5/M, z14.s, z21.s\n"
+      "fmin z15.s, p5/M, z15.s, z21.s\n"
+      "fmin z16.s, p5/M, z16.s, z21.s\n"
+      "fmin z17.s, p5/M, z17.s, z21.s\n"
+      "fmin z18.s, p5/M, z18.s, z21.s\n"
+      "fmin z19.s, p5/M, z19.s, z21.s\n"
+      "fmax z8.s, p5/M, z8.s, z20.s\n"
+      "fmax z9.s, p5/M, z9.s, z20.s\n"
+      "fmax z10.s, p5/M, z10.s, z20.s\n"
+      "fmax z11.s, p5/M, z11.s, z20.s\n"
+      "fmax z12.s, p5/M, z12.s, z20.s\n"
+      "fmax z13.s, p5/M, z13.s, z20.s\n"
+      "fmax z14.s, p5/M, z14.s, z20.s\n"
+      "fmax z15.s, p5/M, z15.s, z20.s\n"
+      "fmax z16.s, p5/M, z16.s, z20.s\n"
+      "fmax z17.s, p5/M, z17.s, z20.s\n"
+      "fmax z18.s, p5/M, z18.s, z20.s\n"
+      "fmax z19.s, p5/M, z19.s, z20.s\n"
       "38:"  // Height 3: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -832,25 +832,25 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21]\n"
+      "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 44f\n"
       "43:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -874,14 +874,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -891,105 +891,105 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "47:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z3.h }, p0/Z, [x26]\n"
+      "ld1rqh { z2.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z1.h }, p0/Z, [x24]\n"
+      "ld1rqh { z0.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64634328  // bfdot z8.s, z25.h, z3.h[0]\n"
+      ".inst 0x6462432c  // bfdot z12.s, z25.h, z2.h[0]\n"
+      ".inst 0x64614330  // bfdot z16.s, z25.h, z1.h[0]\n"
+      ".inst 0x64604334  // bfdot z20.s, z25.h, z0.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      ".inst 0x64634309  // bfdot z9.s, z24.h, z3.h[0]\n"
+      ".inst 0x6462430d  // bfdot z13.s, z24.h, z2.h[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x64614311  // bfdot z17.s, z24.h, z1.h[0]\n"
+      ".inst 0x64604315  // bfdot z21.s, z24.h, z0.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6463432a  // bfdot z10.s, z25.h, z3.h[0]\n"
+      ".inst 0x6462432e  // bfdot z14.s, z25.h, z2.h[0]\n"
+      ".inst 0x64614332  // bfdot z18.s, z25.h, z1.h[0]\n"
+      ".inst 0x64604336  // bfdot z22.s, z25.h, z0.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6463430b  // bfdot z11.s, z24.h, z3.h[0]\n"
+      ".inst 0x6462430f  // bfdot z15.s, z24.h, z2.h[0]\n"
+      ".inst 0x64614313  // bfdot z19.s, z24.h, z1.h[0]\n"
+      ".inst 0x64604317  // bfdot z23.s, z24.h, z0.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x646b4328  // bfdot z8.s, z25.h, z3.h[1]\n"
+      ".inst 0x646a432c  // bfdot z12.s, z25.h, z2.h[1]\n"
+      ".inst 0x64694330  // bfdot z16.s, z25.h, z1.h[1]\n"
+      ".inst 0x64684334  // bfdot z20.s, z25.h, z0.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x646b4309  // bfdot z9.s, z24.h, z3.h[1]\n"
+      ".inst 0x646a430d  // bfdot z13.s, z24.h, z2.h[1]\n"
+      ".inst 0x64694311  // bfdot z17.s, z24.h, z1.h[1]\n"
+      ".inst 0x64684315  // bfdot z21.s, z24.h, z0.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      ".inst 0x646b432a  // bfdot z10.s, z25.h, z3.h[1]\n"
+      ".inst 0x646a432e  // bfdot z14.s, z25.h, z2.h[1]\n"
+      ".inst 0x64694332  // bfdot z18.s, z25.h, z1.h[1]\n"
+      ".inst 0x64684336  // bfdot z22.s, z25.h, z0.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x646b430b  // bfdot z11.s, z24.h, z3.h[1]\n"
+      ".inst 0x646a430f  // bfdot z15.s, z24.h, z2.h[1]\n"
+      ".inst 0x64694313  // bfdot z19.s, z24.h, z1.h[1]\n"
+      ".inst 0x64684317  // bfdot z23.s, z24.h, z0.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x64734328  // bfdot z8.s, z25.h, z3.h[2]\n"
+      ".inst 0x6472432c  // bfdot z12.s, z25.h, z2.h[2]\n"
+      ".inst 0x64714330  // bfdot z16.s, z25.h, z1.h[2]\n"
+      ".inst 0x64704334  // bfdot z20.s, z25.h, z0.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x64734309  // bfdot z9.s, z24.h, z3.h[2]\n"
+      ".inst 0x6472430d  // bfdot z13.s, z24.h, z2.h[2]\n"
+      ".inst 0x64714311  // bfdot z17.s, z24.h, z1.h[2]\n"
+      ".inst 0x64704315  // bfdot z21.s, z24.h, z0.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6473432a  // bfdot z10.s, z25.h, z3.h[2]\n"
+      ".inst 0x6472432e  // bfdot z14.s, z25.h, z2.h[2]\n"
+      ".inst 0x64714332  // bfdot z18.s, z25.h, z1.h[2]\n"
+      ".inst 0x64704336  // bfdot z22.s, z25.h, z0.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6473430b  // bfdot z11.s, z24.h, z3.h[2]\n"
+      ".inst 0x6472430f  // bfdot z15.s, z24.h, z2.h[2]\n"
+      ".inst 0x64714313  // bfdot z19.s, z24.h, z1.h[2]\n"
+      ".inst 0x64704317  // bfdot z23.s, z24.h, z0.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x647b4328  // bfdot z8.s, z25.h, z3.h[3]\n"
+      ".inst 0x647a432c  // bfdot z12.s, z25.h, z2.h[3]\n"
+      ".inst 0x64794330  // bfdot z16.s, z25.h, z1.h[3]\n"
+      ".inst 0x64784334  // bfdot z20.s, z25.h, z0.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x647b4309  // bfdot z9.s, z24.h, z3.h[3]\n"
+      ".inst 0x647a430d  // bfdot z13.s, z24.h, z2.h[3]\n"
+      ".inst 0x64794311  // bfdot z17.s, z24.h, z1.h[3]\n"
+      ".inst 0x64784315  // bfdot z21.s, z24.h, z0.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x647b432a  // bfdot z10.s, z25.h, z3.h[3]\n"
+      ".inst 0x647a432e  // bfdot z14.s, z25.h, z2.h[3]\n"
+      ".inst 0x64794332  // bfdot z18.s, z25.h, z1.h[3]\n"
+      ".inst 0x64784336  // bfdot z22.s, z25.h, z0.h[3]\n"
+      ".inst 0x647b430b  // bfdot z11.s, z24.h, z3.h[3]\n"
+      ".inst 0x647a430f  // bfdot z15.s, z24.h, z2.h[3]\n"
+      ".inst 0x64794313  // bfdot z19.s, z24.h, z1.h[3]\n"
+      ".inst 0x64784317  // bfdot z23.s, z24.h, z0.h[3]\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -998,95 +998,95 @@
       "subs x27, x27, #0x2\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64604328  // bfdot z8.s, z25.h, z0.h[0]\n"
+      ".inst 0x6461432c  // bfdot z12.s, z25.h, z1.h[0]\n"
+      ".inst 0x64624330  // bfdot z16.s, z25.h, z2.h[0]\n"
+      ".inst 0x64634334  // bfdot z20.s, z25.h, z3.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x64604309  // bfdot z9.s, z24.h, z0.h[0]\n"
+      ".inst 0x6461430d  // bfdot z13.s, z24.h, z1.h[0]\n"
+      ".inst 0x64624311  // bfdot z17.s, z24.h, z2.h[0]\n"
+      ".inst 0x64634315  // bfdot z21.s, z24.h, z3.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      ".inst 0x6460432a  // bfdot z10.s, z25.h, z0.h[0]\n"
+      ".inst 0x6461432e  // bfdot z14.s, z25.h, z1.h[0]\n"
+      ".inst 0x64624332  // bfdot z18.s, z25.h, z2.h[0]\n"
+      ".inst 0x64634336  // bfdot z22.s, z25.h, z3.h[0]\n"
+      ".inst 0x6460430b  // bfdot z11.s, z24.h, z0.h[0]\n"
+      ".inst 0x6461430f  // bfdot z15.s, z24.h, z1.h[0]\n"
+      ".inst 0x64624313  // bfdot z19.s, z24.h, z2.h[0]\n"
+      ".inst 0x64634317  // bfdot z23.s, z24.h, z3.h[0]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64684328  // bfdot z8.s, z25.h, z0.h[1]\n"
+      ".inst 0x6469432c  // bfdot z12.s, z25.h, z1.h[1]\n"
+      ".inst 0x646a4330  // bfdot z16.s, z25.h, z2.h[1]\n"
+      ".inst 0x646b4334  // bfdot z20.s, z25.h, z3.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x64684309  // bfdot z9.s, z24.h, z0.h[1]\n"
+      ".inst 0x6469430d  // bfdot z13.s, z24.h, z1.h[1]\n"
+      ".inst 0x646a4311  // bfdot z17.s, z24.h, z2.h[1]\n"
+      ".inst 0x646b4315  // bfdot z21.s, z24.h, z3.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      ".inst 0x6468432a  // bfdot z10.s, z25.h, z0.h[1]\n"
+      ".inst 0x6469432e  // bfdot z14.s, z25.h, z1.h[1]\n"
+      ".inst 0x646a4332  // bfdot z18.s, z25.h, z2.h[1]\n"
+      ".inst 0x646b4336  // bfdot z22.s, z25.h, z3.h[1]\n"
+      ".inst 0x6468430b  // bfdot z11.s, z24.h, z0.h[1]\n"
+      ".inst 0x6469430f  // bfdot z15.s, z24.h, z1.h[1]\n"
+      ".inst 0x646a4313  // bfdot z19.s, z24.h, z2.h[1]\n"
+      ".inst 0x646b4317  // bfdot z23.s, z24.h, z3.h[1]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64704328  // bfdot z8.s, z25.h, z0.h[2]\n"
+      ".inst 0x6471432c  // bfdot z12.s, z25.h, z1.h[2]\n"
+      ".inst 0x64724330  // bfdot z16.s, z25.h, z2.h[2]\n"
+      ".inst 0x64734334  // bfdot z20.s, z25.h, z3.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x64704309  // bfdot z9.s, z24.h, z0.h[2]\n"
+      ".inst 0x6471430d  // bfdot z13.s, z24.h, z1.h[2]\n"
+      ".inst 0x64724311  // bfdot z17.s, z24.h, z2.h[2]\n"
+      ".inst 0x64734315  // bfdot z21.s, z24.h, z3.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      ".inst 0x6470432a  // bfdot z10.s, z25.h, z0.h[2]\n"
+      ".inst 0x6471432e  // bfdot z14.s, z25.h, z1.h[2]\n"
+      ".inst 0x64724332  // bfdot z18.s, z25.h, z2.h[2]\n"
+      ".inst 0x64734336  // bfdot z22.s, z25.h, z3.h[2]\n"
+      ".inst 0x6470430b  // bfdot z11.s, z24.h, z0.h[2]\n"
+      ".inst 0x6471430f  // bfdot z15.s, z24.h, z1.h[2]\n"
+      ".inst 0x64724313  // bfdot z19.s, z24.h, z2.h[2]\n"
+      ".inst 0x64734317  // bfdot z23.s, z24.h, z3.h[2]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64784328  // bfdot z8.s, z25.h, z0.h[3]\n"
+      ".inst 0x6479432c  // bfdot z12.s, z25.h, z1.h[3]\n"
+      ".inst 0x647a4330  // bfdot z16.s, z25.h, z2.h[3]\n"
+      ".inst 0x647b4334  // bfdot z20.s, z25.h, z3.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x64784309  // bfdot z9.s, z24.h, z0.h[3]\n"
+      ".inst 0x6479430d  // bfdot z13.s, z24.h, z1.h[3]\n"
+      ".inst 0x647a4311  // bfdot z17.s, z24.h, z2.h[3]\n"
+      ".inst 0x647b4315  // bfdot z21.s, z24.h, z3.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      ".inst 0x6478432a  // bfdot z10.s, z25.h, z0.h[3]\n"
+      ".inst 0x6479432e  // bfdot z14.s, z25.h, z1.h[3]\n"
+      ".inst 0x647a4332  // bfdot z18.s, z25.h, z2.h[3]\n"
+      ".inst 0x647b4336  // bfdot z22.s, z25.h, z3.h[3]\n"
+      ".inst 0x6478430b  // bfdot z11.s, z24.h, z0.h[3]\n"
+      ".inst 0x6479430f  // bfdot z15.s, z24.h, z1.h[3]\n"
+      ".inst 0x647a4313  // bfdot z19.s, z24.h, z2.h[3]\n"
+      ".inst 0x647b4317  // bfdot z23.s, z24.h, z3.h[3]\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1098,41 +1098,41 @@
       "add x23, x24, x20, LSL #2\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z25.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z23.s, p5/M, z23.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z23.s, p5/M, z23.s, z0.s\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z25.s\n"
+      "fmin z9.s, p5/M, z9.s, z25.s\n"
+      "fmin z10.s, p5/M, z10.s, z25.s\n"
+      "fmin z11.s, p5/M, z11.s, z25.s\n"
+      "fmin z12.s, p5/M, z12.s, z25.s\n"
+      "fmin z13.s, p5/M, z13.s, z25.s\n"
+      "fmin z14.s, p5/M, z14.s, z25.s\n"
+      "fmin z15.s, p5/M, z15.s, z25.s\n"
+      "fmin z16.s, p5/M, z16.s, z25.s\n"
+      "fmin z17.s, p5/M, z17.s, z25.s\n"
+      "fmin z18.s, p5/M, z18.s, z25.s\n"
+      "fmin z19.s, p5/M, z19.s, z25.s\n"
+      "fmin z20.s, p5/M, z20.s, z25.s\n"
+      "fmin z21.s, p5/M, z21.s, z25.s\n"
+      "fmin z22.s, p5/M, z22.s, z25.s\n"
+      "fmin z23.s, p5/M, z23.s, z25.s\n"
+      "fmax z8.s, p5/M, z8.s, z24.s\n"
+      "fmax z9.s, p5/M, z9.s, z24.s\n"
+      "fmax z10.s, p5/M, z10.s, z24.s\n"
+      "fmax z11.s, p5/M, z11.s, z24.s\n"
+      "fmax z12.s, p5/M, z12.s, z24.s\n"
+      "fmax z13.s, p5/M, z13.s, z24.s\n"
+      "fmax z14.s, p5/M, z14.s, z24.s\n"
+      "fmax z15.s, p5/M, z15.s, z24.s\n"
+      "fmax z16.s, p5/M, z16.s, z24.s\n"
+      "fmax z17.s, p5/M, z17.s, z24.s\n"
+      "fmax z18.s, p5/M, z18.s, z24.s\n"
+      "fmax z19.s, p5/M, z19.s, z24.s\n"
+      "fmax z20.s, p5/M, z20.s, z24.s\n"
+      "fmax z21.s, p5/M, z21.s, z24.s\n"
+      "fmax z22.s, p5/M, z22.s, z24.s\n"
+      "fmax z23.s, p5/M, z23.s, z24.s\n"
       "51:"  // Height 4: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1196,30 +1196,30 @@
       "55:"  // Height 5: no bias
       "tbz %x[flags], #0, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x9]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x22]\n"
-      "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22]\n"
+      "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x20]\n"
+      "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 57f\n"
       "56:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1247,15 +1247,15 @@
       "58:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1266,124 +1266,124 @@
       "b 60f\n"
       "59:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "60:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "ble 62f\n"
       "61:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z4.h }, p0/Z, [x26]\n"
+      "ld1rqh { z3.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z1.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "ld1rqh { z0.h }, p0/Z, [x22]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      ".inst 0x646443a8  // bfdot z8.s, z29.h, z4.h[0]\n"
+      ".inst 0x646343ac  // bfdot z12.s, z29.h, z3.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646243b0  // bfdot z16.s, z29.h, z2.h[0]\n"
+      ".inst 0x646143b4  // bfdot z20.s, z29.h, z1.h[0]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x646043b8  // bfdot z24.s, z29.h, z0.h[0]\n"
+      ".inst 0x64644389  // bfdot z9.s, z28.h, z4.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      ".inst 0x6463438d  // bfdot z13.s, z28.h, z3.h[0]\n"
+      ".inst 0x64624391  // bfdot z17.s, z28.h, z2.h[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
-      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x64614395  // bfdot z21.s, z28.h, z1.h[0]\n"
+      ".inst 0x64604399  // bfdot z25.s, z28.h, z0.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x646443aa  // bfdot z10.s, z29.h, z4.h[0]\n"
+      ".inst 0x646343ae  // bfdot z14.s, z29.h, z3.h[0]\n"
+      ".inst 0x646243b2  // bfdot z18.s, z29.h, z2.h[0]\n"
+      ".inst 0x646143b6  // bfdot z22.s, z29.h, z1.h[0]\n"
+      ".inst 0x646043ba  // bfdot z26.s, z29.h, z0.h[0]\n"
+      ".inst 0x6464438b  // bfdot z11.s, z28.h, z4.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6463438f  // bfdot z15.s, z28.h, z3.h[0]\n"
+      ".inst 0x64624393  // bfdot z19.s, z28.h, z2.h[0]\n"
+      ".inst 0x64614397  // bfdot z23.s, z28.h, z1.h[0]\n"
+      ".inst 0x6460439b  // bfdot z27.s, z28.h, z0.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x646c43a8  // bfdot z8.s, z29.h, z4.h[1]\n"
+      ".inst 0x646b43ac  // bfdot z12.s, z29.h, z3.h[1]\n"
+      ".inst 0x646a43b0  // bfdot z16.s, z29.h, z2.h[1]\n"
+      ".inst 0x646943b4  // bfdot z20.s, z29.h, z1.h[1]\n"
+      ".inst 0x646843b8  // bfdot z24.s, z29.h, z0.h[1]\n"
+      ".inst 0x646c4389  // bfdot z9.s, z28.h, z4.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x646b438d  // bfdot z13.s, z28.h, z3.h[1]\n"
+      ".inst 0x646a4391  // bfdot z17.s, z28.h, z2.h[1]\n"
+      ".inst 0x64694395  // bfdot z21.s, z28.h, z1.h[1]\n"
+      ".inst 0x64684399  // bfdot z25.s, z28.h, z0.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
-      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
-      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
-      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
+      ".inst 0x646c43aa  // bfdot z10.s, z29.h, z4.h[1]\n"
+      ".inst 0x646b43ae  // bfdot z14.s, z29.h, z3.h[1]\n"
+      ".inst 0x646a43b2  // bfdot z18.s, z29.h, z2.h[1]\n"
+      ".inst 0x646943b6  // bfdot z22.s, z29.h, z1.h[1]\n"
+      ".inst 0x646843ba  // bfdot z26.s, z29.h, z0.h[1]\n"
+      ".inst 0x646c438b  // bfdot z11.s, z28.h, z4.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x646b438f  // bfdot z15.s, z28.h, z3.h[1]\n"
+      ".inst 0x646a4393  // bfdot z19.s, z28.h, z2.h[1]\n"
+      ".inst 0x64694397  // bfdot z23.s, z28.h, z1.h[1]\n"
+      ".inst 0x6468439b  // bfdot z27.s, z28.h, z0.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x647443a8  // bfdot z8.s, z29.h, z4.h[2]\n"
+      ".inst 0x647343ac  // bfdot z12.s, z29.h, z3.h[2]\n"
+      ".inst 0x647243b0  // bfdot z16.s, z29.h, z2.h[2]\n"
+      ".inst 0x647143b4  // bfdot z20.s, z29.h, z1.h[2]\n"
+      ".inst 0x647043b8  // bfdot z24.s, z29.h, z0.h[2]\n"
+      ".inst 0x64744389  // bfdot z9.s, z28.h, z4.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x6473438d  // bfdot z13.s, z28.h, z3.h[2]\n"
+      ".inst 0x64724391  // bfdot z17.s, z28.h, z2.h[2]\n"
+      ".inst 0x64714395  // bfdot z21.s, z28.h, z1.h[2]\n"
+      ".inst 0x64704399  // bfdot z25.s, z28.h, z0.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x647443aa  // bfdot z10.s, z29.h, z4.h[2]\n"
+      ".inst 0x647343ae  // bfdot z14.s, z29.h, z3.h[2]\n"
+      ".inst 0x647243b2  // bfdot z18.s, z29.h, z2.h[2]\n"
+      ".inst 0x647143b6  // bfdot z22.s, z29.h, z1.h[2]\n"
+      ".inst 0x647043ba  // bfdot z26.s, z29.h, z0.h[2]\n"
+      ".inst 0x6474438b  // bfdot z11.s, z28.h, z4.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6473438f  // bfdot z15.s, z28.h, z3.h[2]\n"
+      ".inst 0x64724393  // bfdot z19.s, z28.h, z2.h[2]\n"
+      ".inst 0x64714397  // bfdot z23.s, z28.h, z1.h[2]\n"
+      ".inst 0x6470439b  // bfdot z27.s, z28.h, z0.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x647c43a8  // bfdot z8.s, z29.h, z4.h[3]\n"
+      ".inst 0x647b43ac  // bfdot z12.s, z29.h, z3.h[3]\n"
+      ".inst 0x647a43b0  // bfdot z16.s, z29.h, z2.h[3]\n"
+      ".inst 0x647943b4  // bfdot z20.s, z29.h, z1.h[3]\n"
+      ".inst 0x647843b8  // bfdot z24.s, z29.h, z0.h[3]\n"
+      ".inst 0x647c4389  // bfdot z9.s, z28.h, z4.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x647b438d  // bfdot z13.s, z28.h, z3.h[3]\n"
+      ".inst 0x647a4391  // bfdot z17.s, z28.h, z2.h[3]\n"
+      ".inst 0x64794395  // bfdot z21.s, z28.h, z1.h[3]\n"
+      ".inst 0x64784399  // bfdot z25.s, z28.h, z0.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x647c43aa  // bfdot z10.s, z29.h, z4.h[3]\n"
+      ".inst 0x647b43ae  // bfdot z14.s, z29.h, z3.h[3]\n"
+      ".inst 0x647a43b2  // bfdot z18.s, z29.h, z2.h[3]\n"
+      ".inst 0x647943b6  // bfdot z22.s, z29.h, z1.h[3]\n"
+      ".inst 0x647843ba  // bfdot z26.s, z29.h, z0.h[3]\n"
+      ".inst 0x647c438b  // bfdot z11.s, z28.h, z4.h[3]\n"
+      ".inst 0x647b438f  // bfdot z15.s, z28.h, z3.h[3]\n"
+      ".inst 0x647a4393  // bfdot z19.s, z28.h, z2.h[3]\n"
+      ".inst 0x64794397  // bfdot z23.s, z28.h, z1.h[3]\n"
+      ".inst 0x6478439b  // bfdot z27.s, z28.h, z0.h[3]\n"
       "bgt 61b\n"
       "62:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -1393,111 +1393,111 @@
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
       "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
-      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      ".inst 0x646043a8  // bfdot z8.s, z29.h, z0.h[0]\n"
+      ".inst 0x646143ac  // bfdot z12.s, z29.h, z1.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646243b0  // bfdot z16.s, z29.h, z2.h[0]\n"
+      ".inst 0x646343b4  // bfdot z20.s, z29.h, z3.h[0]\n"
+      ".inst 0x646443b8  // bfdot z24.s, z29.h, z4.h[0]\n"
+      ".inst 0x64604389  // bfdot z9.s, z28.h, z0.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6461438d  // bfdot z13.s, z28.h, z1.h[0]\n"
+      ".inst 0x64624391  // bfdot z17.s, z28.h, z2.h[0]\n"
+      ".inst 0x64634395  // bfdot z21.s, z28.h, z3.h[0]\n"
+      ".inst 0x64644399  // bfdot z25.s, z28.h, z4.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
+      ".inst 0x646043aa  // bfdot z10.s, z29.h, z0.h[0]\n"
+      ".inst 0x646143ae  // bfdot z14.s, z29.h, z1.h[0]\n"
+      ".inst 0x646243b2  // bfdot z18.s, z29.h, z2.h[0]\n"
+      ".inst 0x646343b6  // bfdot z22.s, z29.h, z3.h[0]\n"
+      ".inst 0x646443ba  // bfdot z26.s, z29.h, z4.h[0]\n"
+      ".inst 0x6460438b  // bfdot z11.s, z28.h, z0.h[0]\n"
+      ".inst 0x6461438f  // bfdot z15.s, z28.h, z1.h[0]\n"
+      ".inst 0x64624393  // bfdot z19.s, z28.h, z2.h[0]\n"
+      ".inst 0x64634397  // bfdot z23.s, z28.h, z3.h[0]\n"
+      ".inst 0x6464439b  // bfdot z27.s, z28.h, z4.h[0]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646843a8  // bfdot z8.s, z29.h, z0.h[1]\n"
+      ".inst 0x646943ac  // bfdot z12.s, z29.h, z1.h[1]\n"
+      ".inst 0x646a43b0  // bfdot z16.s, z29.h, z2.h[1]\n"
+      ".inst 0x646b43b4  // bfdot z20.s, z29.h, z3.h[1]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x646c43b8  // bfdot z24.s, z29.h, z4.h[1]\n"
+      ".inst 0x64684389  // bfdot z9.s, z28.h, z0.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6469438d  // bfdot z13.s, z28.h, z1.h[1]\n"
+      ".inst 0x646a4391  // bfdot z17.s, z28.h, z2.h[1]\n"
+      ".inst 0x646b4395  // bfdot z21.s, z28.h, z3.h[1]\n"
+      ".inst 0x646c4399  // bfdot z25.s, z28.h, z4.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
-      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
+      ".inst 0x646843aa  // bfdot z10.s, z29.h, z0.h[1]\n"
+      ".inst 0x646943ae  // bfdot z14.s, z29.h, z1.h[1]\n"
+      ".inst 0x646a43b2  // bfdot z18.s, z29.h, z2.h[1]\n"
+      ".inst 0x646b43b6  // bfdot z22.s, z29.h, z3.h[1]\n"
+      ".inst 0x646c43ba  // bfdot z26.s, z29.h, z4.h[1]\n"
+      ".inst 0x6468438b  // bfdot z11.s, z28.h, z0.h[1]\n"
+      ".inst 0x6469438f  // bfdot z15.s, z28.h, z1.h[1]\n"
+      ".inst 0x646a4393  // bfdot z19.s, z28.h, z2.h[1]\n"
+      ".inst 0x646b4397  // bfdot z23.s, z28.h, z3.h[1]\n"
+      ".inst 0x646c439b  // bfdot z27.s, z28.h, z4.h[1]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x647043a8  // bfdot z8.s, z29.h, z0.h[2]\n"
+      ".inst 0x647143ac  // bfdot z12.s, z29.h, z1.h[2]\n"
+      ".inst 0x647243b0  // bfdot z16.s, z29.h, z2.h[2]\n"
+      ".inst 0x647343b4  // bfdot z20.s, z29.h, z3.h[2]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x647443b8  // bfdot z24.s, z29.h, z4.h[2]\n"
+      ".inst 0x64704389  // bfdot z9.s, z28.h, z0.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6471438d  // bfdot z13.s, z28.h, z1.h[2]\n"
+      ".inst 0x64724391  // bfdot z17.s, z28.h, z2.h[2]\n"
+      ".inst 0x64734395  // bfdot z21.s, z28.h, z3.h[2]\n"
+      ".inst 0x64744399  // bfdot z25.s, z28.h, z4.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
-      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
+      ".inst 0x647043aa  // bfdot z10.s, z29.h, z0.h[2]\n"
+      ".inst 0x647143ae  // bfdot z14.s, z29.h, z1.h[2]\n"
+      ".inst 0x647243b2  // bfdot z18.s, z29.h, z2.h[2]\n"
+      ".inst 0x647343b6  // bfdot z22.s, z29.h, z3.h[2]\n"
+      ".inst 0x647443ba  // bfdot z26.s, z29.h, z4.h[2]\n"
+      ".inst 0x6470438b  // bfdot z11.s, z28.h, z0.h[2]\n"
+      ".inst 0x6471438f  // bfdot z15.s, z28.h, z1.h[2]\n"
+      ".inst 0x64724393  // bfdot z19.s, z28.h, z2.h[2]\n"
+      ".inst 0x64734397  // bfdot z23.s, z28.h, z3.h[2]\n"
+      ".inst 0x6474439b  // bfdot z27.s, z28.h, z4.h[2]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x647843a8  // bfdot z8.s, z29.h, z0.h[3]\n"
+      ".inst 0x647943ac  // bfdot z12.s, z29.h, z1.h[3]\n"
+      ".inst 0x647a43b0  // bfdot z16.s, z29.h, z2.h[3]\n"
+      ".inst 0x647b43b4  // bfdot z20.s, z29.h, z3.h[3]\n"
+      ".inst 0x647c43b8  // bfdot z24.s, z29.h, z4.h[3]\n"
+      ".inst 0x64784389  // bfdot z9.s, z28.h, z0.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6479438d  // bfdot z13.s, z28.h, z1.h[3]\n"
+      ".inst 0x647a4391  // bfdot z17.s, z28.h, z2.h[3]\n"
+      ".inst 0x647b4395  // bfdot z21.s, z28.h, z3.h[3]\n"
+      ".inst 0x647c4399  // bfdot z25.s, z28.h, z4.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
+      ".inst 0x647843aa  // bfdot z10.s, z29.h, z0.h[3]\n"
+      ".inst 0x647943ae  // bfdot z14.s, z29.h, z1.h[3]\n"
+      ".inst 0x647a43b2  // bfdot z18.s, z29.h, z2.h[3]\n"
+      ".inst 0x647b43b6  // bfdot z22.s, z29.h, z3.h[3]\n"
+      ".inst 0x647c43ba  // bfdot z26.s, z29.h, z4.h[3]\n"
+      ".inst 0x6478438b  // bfdot z11.s, z28.h, z0.h[3]\n"
+      ".inst 0x6479438f  // bfdot z15.s, z28.h, z1.h[3]\n"
+      ".inst 0x647a4393  // bfdot z19.s, z28.h, z2.h[3]\n"
+      ".inst 0x647b4397  // bfdot z23.s, z28.h, z3.h[3]\n"
+      ".inst 0x647c439b  // bfdot z27.s, z28.h, z4.h[3]\n"
       "63:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1510,49 +1510,49 @@
       "add x22, x23, x20, LSL #2\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z29.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z23.s, p5/M, z23.s, z1.s\n"
-      "fmin z24.s, p5/M, z24.s, z1.s\n"
-      "fmin z25.s, p5/M, z25.s, z1.s\n"
-      "fmin z26.s, p5/M, z26.s, z1.s\n"
-      "fmin z27.s, p5/M, z27.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z23.s, p5/M, z23.s, z0.s\n"
-      "fmax z24.s, p5/M, z24.s, z0.s\n"
-      "fmax z25.s, p5/M, z25.s, z0.s\n"
-      "fmax z26.s, p5/M, z26.s, z0.s\n"
-      "fmax z27.s, p5/M, z27.s, z0.s\n"
+      "ld1rw { z28.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z29.s\n"
+      "fmin z9.s, p5/M, z9.s, z29.s\n"
+      "fmin z10.s, p5/M, z10.s, z29.s\n"
+      "fmin z11.s, p5/M, z11.s, z29.s\n"
+      "fmin z12.s, p5/M, z12.s, z29.s\n"
+      "fmin z13.s, p5/M, z13.s, z29.s\n"
+      "fmin z14.s, p5/M, z14.s, z29.s\n"
+      "fmin z15.s, p5/M, z15.s, z29.s\n"
+      "fmin z16.s, p5/M, z16.s, z29.s\n"
+      "fmin z17.s, p5/M, z17.s, z29.s\n"
+      "fmin z18.s, p5/M, z18.s, z29.s\n"
+      "fmin z19.s, p5/M, z19.s, z29.s\n"
+      "fmin z20.s, p5/M, z20.s, z29.s\n"
+      "fmin z21.s, p5/M, z21.s, z29.s\n"
+      "fmin z22.s, p5/M, z22.s, z29.s\n"
+      "fmin z23.s, p5/M, z23.s, z29.s\n"
+      "fmin z24.s, p5/M, z24.s, z29.s\n"
+      "fmin z25.s, p5/M, z25.s, z29.s\n"
+      "fmin z26.s, p5/M, z26.s, z29.s\n"
+      "fmin z27.s, p5/M, z27.s, z29.s\n"
+      "fmax z8.s, p5/M, z8.s, z28.s\n"
+      "fmax z9.s, p5/M, z9.s, z28.s\n"
+      "fmax z10.s, p5/M, z10.s, z28.s\n"
+      "fmax z11.s, p5/M, z11.s, z28.s\n"
+      "fmax z12.s, p5/M, z12.s, z28.s\n"
+      "fmax z13.s, p5/M, z13.s, z28.s\n"
+      "fmax z14.s, p5/M, z14.s, z28.s\n"
+      "fmax z15.s, p5/M, z15.s, z28.s\n"
+      "fmax z16.s, p5/M, z16.s, z28.s\n"
+      "fmax z17.s, p5/M, z17.s, z28.s\n"
+      "fmax z18.s, p5/M, z18.s, z28.s\n"
+      "fmax z19.s, p5/M, z19.s, z28.s\n"
+      "fmax z20.s, p5/M, z20.s, z28.s\n"
+      "fmax z21.s, p5/M, z21.s, z28.s\n"
+      "fmax z22.s, p5/M, z22.s, z28.s\n"
+      "fmax z23.s, p5/M, z23.s, z28.s\n"
+      "fmax z24.s, p5/M, z24.s, z28.s\n"
+      "fmax z25.s, p5/M, z25.s, z28.s\n"
+      "fmax z26.s, p5/M, z26.s, z28.s\n"
+      "fmax z27.s, p5/M, z27.s, z28.s\n"
       "64:"  // Height 5: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1627,35 +1627,35 @@
       "68:"  // Height 6: no bias
       "tbz %x[flags], #0, 69f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x9]\n"
+      "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x22]\n"
-      "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21]\n"
-      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x24]\n"
+      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x23]\n"
+      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x21]\n"
+      "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 70f\n"
       "69:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1687,16 +1687,16 @@
       "71:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 72f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 73f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1708,143 +1708,143 @@
       "b 73f\n"
       "72:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "73:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "ble 75f\n"
       "74:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z7.h }, p0/Z, [x26]\n"
+      "ld1rqh { z6.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z5.h }, p0/Z, [x24]\n"
+      "ld1rqh { z4.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1rqh { z5.h }, p0/Z, [x21]\n"
+      "ld1rqh { z3.h }, p0/Z, [x22]\n"
+      "ld1rqh { z2.h }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x64674028  // bfdot z8.s, z1.h, z7.h[0]\n"
+      ".inst 0x6466402c  // bfdot z12.s, z1.h, z6.h[0]\n"
+      ".inst 0x64654030  // bfdot z16.s, z1.h, z5.h[0]\n"
+      ".inst 0x64644034  // bfdot z20.s, z1.h, z4.h[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
-      ".inst 0x646540dc  // bfdot z28.s, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x64634038  // bfdot z24.s, z1.h, z3.h[0]\n"
+      ".inst 0x6462403c  // bfdot z28.s, z1.h, z2.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
-      ".inst 0x646540fd  // bfdot z29.s, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
-      ".inst 0x646540de  // bfdot z30.s, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
-      ".inst 0x646540ff  // bfdot z31.s, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
-      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
-      ".inst 0x646d40dc  // bfdot z28.s, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
-      ".inst 0x646d40fd  // bfdot z29.s, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x64674009  // bfdot z9.s, z0.h, z7.h[0]\n"
+      ".inst 0x6466400d  // bfdot z13.s, z0.h, z6.h[0]\n"
+      ".inst 0x64654011  // bfdot z17.s, z0.h, z5.h[0]\n"
+      ".inst 0x64644015  // bfdot z21.s, z0.h, z4.h[0]\n"
+      ".inst 0x64634019  // bfdot z25.s, z0.h, z3.h[0]\n"
+      ".inst 0x6462401d  // bfdot z29.s, z0.h, z2.h[0]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6467402a  // bfdot z10.s, z1.h, z7.h[0]\n"
+      ".inst 0x6466402e  // bfdot z14.s, z1.h, z6.h[0]\n"
+      ".inst 0x64654032  // bfdot z18.s, z1.h, z5.h[0]\n"
+      ".inst 0x64644036  // bfdot z22.s, z1.h, z4.h[0]\n"
+      ".inst 0x6463403a  // bfdot z26.s, z1.h, z3.h[0]\n"
+      ".inst 0x6462403e  // bfdot z30.s, z1.h, z2.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6467400b  // bfdot z11.s, z0.h, z7.h[0]\n"
+      ".inst 0x6466400f  // bfdot z15.s, z0.h, z6.h[0]\n"
+      ".inst 0x64654013  // bfdot z19.s, z0.h, z5.h[0]\n"
+      ".inst 0x64644017  // bfdot z23.s, z0.h, z4.h[0]\n"
+      ".inst 0x6463401b  // bfdot z27.s, z0.h, z3.h[0]\n"
+      ".inst 0x6462401f  // bfdot z31.s, z0.h, z2.h[0]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x646f4028  // bfdot z8.s, z1.h, z7.h[1]\n"
+      ".inst 0x646e402c  // bfdot z12.s, z1.h, z6.h[1]\n"
+      ".inst 0x646d4030  // bfdot z16.s, z1.h, z5.h[1]\n"
+      ".inst 0x646c4034  // bfdot z20.s, z1.h, z4.h[1]\n"
+      ".inst 0x646b4038  // bfdot z24.s, z1.h, z3.h[1]\n"
+      ".inst 0x646a403c  // bfdot z28.s, z1.h, z2.h[1]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x646f4009  // bfdot z9.s, z0.h, z7.h[1]\n"
+      ".inst 0x646e400d  // bfdot z13.s, z0.h, z6.h[1]\n"
+      ".inst 0x646d4011  // bfdot z17.s, z0.h, z5.h[1]\n"
+      ".inst 0x646c4015  // bfdot z21.s, z0.h, z4.h[1]\n"
+      ".inst 0x646b4019  // bfdot z25.s, z0.h, z3.h[1]\n"
+      ".inst 0x646a401d  // bfdot z29.s, z0.h, z2.h[1]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
-      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
-      ".inst 0x646d40de  // bfdot z30.s, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
-      ".inst 0x646d40ff  // bfdot z31.s, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
-      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
-      ".inst 0x647540dc  // bfdot z28.s, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
-      ".inst 0x647540fd  // bfdot z29.s, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
-      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
-      ".inst 0x647540de  // bfdot z30.s, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
-      ".inst 0x647540ff  // bfdot z31.s, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
-      ".inst 0x647d40dc  // bfdot z28.s, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
-      ".inst 0x647d40fd  // bfdot z29.s, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
-      ".inst 0x647d40de  // bfdot z30.s, z6.h, z5.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
-      ".inst 0x647d40ff  // bfdot z31.s, z7.h, z5.h[3]\n"
+      ".inst 0x646f402a  // bfdot z10.s, z1.h, z7.h[1]\n"
+      ".inst 0x646e402e  // bfdot z14.s, z1.h, z6.h[1]\n"
+      ".inst 0x646d4032  // bfdot z18.s, z1.h, z5.h[1]\n"
+      ".inst 0x646c4036  // bfdot z22.s, z1.h, z4.h[1]\n"
+      ".inst 0x646b403a  // bfdot z26.s, z1.h, z3.h[1]\n"
+      ".inst 0x646a403e  // bfdot z30.s, z1.h, z2.h[1]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x646f400b  // bfdot z11.s, z0.h, z7.h[1]\n"
+      ".inst 0x646e400f  // bfdot z15.s, z0.h, z6.h[1]\n"
+      ".inst 0x646d4013  // bfdot z19.s, z0.h, z5.h[1]\n"
+      ".inst 0x646c4017  // bfdot z23.s, z0.h, z4.h[1]\n"
+      ".inst 0x646b401b  // bfdot z27.s, z0.h, z3.h[1]\n"
+      ".inst 0x646a401f  // bfdot z31.s, z0.h, z2.h[1]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x64774028  // bfdot z8.s, z1.h, z7.h[2]\n"
+      ".inst 0x6476402c  // bfdot z12.s, z1.h, z6.h[2]\n"
+      ".inst 0x64754030  // bfdot z16.s, z1.h, z5.h[2]\n"
+      ".inst 0x64744034  // bfdot z20.s, z1.h, z4.h[2]\n"
+      ".inst 0x64734038  // bfdot z24.s, z1.h, z3.h[2]\n"
+      ".inst 0x6472403c  // bfdot z28.s, z1.h, z2.h[2]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x64774009  // bfdot z9.s, z0.h, z7.h[2]\n"
+      ".inst 0x6476400d  // bfdot z13.s, z0.h, z6.h[2]\n"
+      ".inst 0x64754011  // bfdot z17.s, z0.h, z5.h[2]\n"
+      ".inst 0x64744015  // bfdot z21.s, z0.h, z4.h[2]\n"
+      ".inst 0x64734019  // bfdot z25.s, z0.h, z3.h[2]\n"
+      ".inst 0x6472401d  // bfdot z29.s, z0.h, z2.h[2]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6477402a  // bfdot z10.s, z1.h, z7.h[2]\n"
+      ".inst 0x6476402e  // bfdot z14.s, z1.h, z6.h[2]\n"
+      ".inst 0x64754032  // bfdot z18.s, z1.h, z5.h[2]\n"
+      ".inst 0x64744036  // bfdot z22.s, z1.h, z4.h[2]\n"
+      ".inst 0x6473403a  // bfdot z26.s, z1.h, z3.h[2]\n"
+      ".inst 0x6472403e  // bfdot z30.s, z1.h, z2.h[2]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6477400b  // bfdot z11.s, z0.h, z7.h[2]\n"
+      ".inst 0x6476400f  // bfdot z15.s, z0.h, z6.h[2]\n"
+      ".inst 0x64754013  // bfdot z19.s, z0.h, z5.h[2]\n"
+      ".inst 0x64744017  // bfdot z23.s, z0.h, z4.h[2]\n"
+      ".inst 0x6473401b  // bfdot z27.s, z0.h, z3.h[2]\n"
+      ".inst 0x6472401f  // bfdot z31.s, z0.h, z2.h[2]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x647f4028  // bfdot z8.s, z1.h, z7.h[3]\n"
+      ".inst 0x647e402c  // bfdot z12.s, z1.h, z6.h[3]\n"
+      ".inst 0x647d4030  // bfdot z16.s, z1.h, z5.h[3]\n"
+      ".inst 0x647c4034  // bfdot z20.s, z1.h, z4.h[3]\n"
+      ".inst 0x647b4038  // bfdot z24.s, z1.h, z3.h[3]\n"
+      ".inst 0x647a403c  // bfdot z28.s, z1.h, z2.h[3]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x647f4009  // bfdot z9.s, z0.h, z7.h[3]\n"
+      ".inst 0x647e400d  // bfdot z13.s, z0.h, z6.h[3]\n"
+      ".inst 0x647d4011  // bfdot z17.s, z0.h, z5.h[3]\n"
+      ".inst 0x647c4015  // bfdot z21.s, z0.h, z4.h[3]\n"
+      ".inst 0x647b4019  // bfdot z25.s, z0.h, z3.h[3]\n"
+      ".inst 0x647a401d  // bfdot z29.s, z0.h, z2.h[3]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x647f402a  // bfdot z10.s, z1.h, z7.h[3]\n"
+      ".inst 0x647e402e  // bfdot z14.s, z1.h, z6.h[3]\n"
+      ".inst 0x647d4032  // bfdot z18.s, z1.h, z5.h[3]\n"
+      ".inst 0x647c4036  // bfdot z22.s, z1.h, z4.h[3]\n"
+      ".inst 0x647b403a  // bfdot z26.s, z1.h, z3.h[3]\n"
+      ".inst 0x647a403e  // bfdot z30.s, z1.h, z2.h[3]\n"
+      ".inst 0x647f400b  // bfdot z11.s, z0.h, z7.h[3]\n"
+      ".inst 0x647e400f  // bfdot z15.s, z0.h, z6.h[3]\n"
+      ".inst 0x647d4013  // bfdot z19.s, z0.h, z5.h[3]\n"
+      ".inst 0x647c4017  // bfdot z23.s, z0.h, z4.h[3]\n"
+      ".inst 0x647b401b  // bfdot z27.s, z0.h, z3.h[3]\n"
+      ".inst 0x647a401f  // bfdot z31.s, z0.h, z2.h[3]\n"
       "bgt 74b\n"
       "75:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -1855,127 +1855,127 @@
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
       "ld1rqh { z4.h }, p0/Z, [x22]\n"
       "ld1rqh { z5.h }, p0/Z, [x21]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
-      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
-      ".inst 0x646540dc  // bfdot z28.s, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
-      ".inst 0x646540fd  // bfdot z29.s, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646040e8  // bfdot z8.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ec  // bfdot z12.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f0  // bfdot z16.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f4  // bfdot z20.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440f8  // bfdot z24.s, z7.h, z4.h[0]\n"
+      ".inst 0x646540fc  // bfdot z28.s, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x646040c9  // bfdot z9.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140cd  // bfdot z13.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d1  // bfdot z17.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d5  // bfdot z21.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440d9  // bfdot z25.s, z6.h, z4.h[0]\n"
+      ".inst 0x646540dd  // bfdot z29.s, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
-      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
-      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
-      ".inst 0x646540de  // bfdot z30.s, z6.h, z5.h[0]\n"
-      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
-      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
-      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
-      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
-      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
-      ".inst 0x646540ff  // bfdot z31.s, z7.h, z5.h[0]\n"
+      ".inst 0x646040ea  // bfdot z10.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ee  // bfdot z14.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f2  // bfdot z18.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f6  // bfdot z22.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440fa  // bfdot z26.s, z7.h, z4.h[0]\n"
+      ".inst 0x646540fe  // bfdot z30.s, z7.h, z5.h[0]\n"
+      ".inst 0x646040cb  // bfdot z11.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140cf  // bfdot z15.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d3  // bfdot z19.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d7  // bfdot z23.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440db  // bfdot z27.s, z6.h, z4.h[0]\n"
+      ".inst 0x646540df  // bfdot z31.s, z6.h, z5.h[0]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x646840e8  // bfdot z8.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ec  // bfdot z12.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f0  // bfdot z16.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f4  // bfdot z20.s, z7.h, z3.h[1]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
-      ".inst 0x646d40dc  // bfdot z28.s, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
-      ".inst 0x646d40fd  // bfdot z29.s, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x646c40f8  // bfdot z24.s, z7.h, z4.h[1]\n"
+      ".inst 0x646d40fc  // bfdot z28.s, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x646840c9  // bfdot z9.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cd  // bfdot z13.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d1  // bfdot z17.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d5  // bfdot z21.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40d9  // bfdot z25.s, z6.h, z4.h[1]\n"
+      ".inst 0x646d40dd  // bfdot z29.s, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
-      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
-      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
-      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
-      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
-      ".inst 0x646d40de  // bfdot z30.s, z6.h, z5.h[1]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
-      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
-      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
-      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
-      ".inst 0x646d40ff  // bfdot z31.s, z7.h, z5.h[1]\n"
+      ".inst 0x646840ea  // bfdot z10.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ee  // bfdot z14.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f2  // bfdot z18.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f6  // bfdot z22.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40fa  // bfdot z26.s, z7.h, z4.h[1]\n"
+      ".inst 0x646d40fe  // bfdot z30.s, z7.h, z5.h[1]\n"
+      ".inst 0x646840cb  // bfdot z11.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cf  // bfdot z15.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d3  // bfdot z19.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d7  // bfdot z23.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40db  // bfdot z27.s, z6.h, z4.h[1]\n"
+      ".inst 0x646d40df  // bfdot z31.s, z6.h, z5.h[1]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x647040e8  // bfdot z8.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ec  // bfdot z12.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f0  // bfdot z16.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f4  // bfdot z20.s, z7.h, z3.h[2]\n"
       "subs x27, x27, #0x2\n"
-      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
-      ".inst 0x647540dc  // bfdot z28.s, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
-      ".inst 0x647540fd  // bfdot z29.s, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x647440f8  // bfdot z24.s, z7.h, z4.h[2]\n"
+      ".inst 0x647540fc  // bfdot z28.s, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x647040c9  // bfdot z9.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cd  // bfdot z13.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d1  // bfdot z17.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d5  // bfdot z21.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440d9  // bfdot z25.s, z6.h, z4.h[2]\n"
+      ".inst 0x647540dd  // bfdot z29.s, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
-      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
-      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
-      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
-      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
-      ".inst 0x647540de  // bfdot z30.s, z6.h, z5.h[2]\n"
-      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
-      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
-      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
-      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
-      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
-      ".inst 0x647540ff  // bfdot z31.s, z7.h, z5.h[2]\n"
+      ".inst 0x647040ea  // bfdot z10.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ee  // bfdot z14.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f2  // bfdot z18.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f6  // bfdot z22.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440fa  // bfdot z26.s, z7.h, z4.h[2]\n"
+      ".inst 0x647540fe  // bfdot z30.s, z7.h, z5.h[2]\n"
+      ".inst 0x647040cb  // bfdot z11.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cf  // bfdot z15.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d3  // bfdot z19.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d7  // bfdot z23.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440db  // bfdot z27.s, z6.h, z4.h[2]\n"
+      ".inst 0x647540df  // bfdot z31.s, z6.h, z5.h[2]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
-      ".inst 0x647d40dc  // bfdot z28.s, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
-      ".inst 0x647d40fd  // bfdot z29.s, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x647840e8  // bfdot z8.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ec  // bfdot z12.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f0  // bfdot z16.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f4  // bfdot z20.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40f8  // bfdot z24.s, z7.h, z4.h[3]\n"
+      ".inst 0x647d40fc  // bfdot z28.s, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x647840c9  // bfdot z9.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cd  // bfdot z13.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d1  // bfdot z17.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d5  // bfdot z21.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40d9  // bfdot z25.s, z6.h, z4.h[3]\n"
+      ".inst 0x647d40dd  // bfdot z29.s, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
-      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
-      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
-      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
-      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
-      ".inst 0x647d40de  // bfdot z30.s, z6.h, z5.h[3]\n"
-      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
-      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
-      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
-      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
-      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
-      ".inst 0x647d40ff  // bfdot z31.s, z7.h, z5.h[3]\n"
+      ".inst 0x647840ea  // bfdot z10.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ee  // bfdot z14.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f2  // bfdot z18.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f6  // bfdot z22.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40fa  // bfdot z26.s, z7.h, z4.h[3]\n"
+      ".inst 0x647d40fe  // bfdot z30.s, z7.h, z5.h[3]\n"
+      ".inst 0x647840cb  // bfdot z11.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cf  // bfdot z15.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d3  // bfdot z19.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d7  // bfdot z23.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40db  // bfdot z27.s, z6.h, z4.h[3]\n"
+      ".inst 0x647d40df  // bfdot z31.s, z6.h, z5.h[3]\n"
       "76:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2082,7 +2082,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "80:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2090,4 +2089,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
index b8d237f..223d8a7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, bfloat16>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -100,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
index 9bb67f1..74e2d26 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -133,16 +133,16 @@
       "b 5f\n"
       "3:"  // Height 1: no bias
       "tbz %x[flags], #0, 4f\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x9]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "zip1 z8.d, z16.d, z12.d\n"
+      "zip2 z12.d, z16.d, z12.d\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 5f\n"
@@ -160,11 +160,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -176,86 +176,86 @@
       "ble 10f\n"
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqh { z20.h }, p0/Z, [x26]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x6471e688  // bfmmla z8.s, z20.h, z17.h\n"
+      ".inst 0x6470e68c  // bfmmla z12.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6471e689  // bfmmla z9.s, z20.h, z17.h\n"
+      ".inst 0x6470e68d  // bfmmla z13.s, z20.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x6470e68a  // bfmmla z10.s, z20.h, z16.h\n"
+      ".inst 0x6471e68e  // bfmmla z14.s, z20.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      ".inst 0x6471e68b  // bfmmla z11.s, z20.h, z17.h\n"
+      ".inst 0x6470e68f  // bfmmla z15.s, z20.h, z16.h\n"
       "add x26, x26, #0x10\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
       "addvl x10, x10, #8\n"
       "ble 11f\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e428  // bfmmla z8.s, z1.h, z17.h\n"
+      ".inst 0x6470e42c  // bfmmla z12.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e429  // bfmmla z9.s, z1.h, z17.h\n"
+      ".inst 0x6470e42d  // bfmmla z13.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e42a  // bfmmla z10.s, z1.h, z17.h\n"
+      ".inst 0x6470e42e  // bfmmla z14.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6471e42b  // bfmmla z11.s, z1.h, z17.h\n"
+      ".inst 0x6470e42f  // bfmmla z15.s, z1.h, z16.h\n"
       "addvl x10, x10, #8\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -268,17 +268,17 @@
       "uzp1 z11.d, z11.d, z15.d\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z21.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z21.s\n"
+      "fmin z9.s, p5/M, z9.s, z21.s\n"
+      "fmin z10.s, p5/M, z10.s, z21.s\n"
+      "fmin z11.s, p5/M, z11.s, z21.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "12:"  // Height 1: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -322,21 +322,21 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x9, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 18f\n"
@@ -354,12 +354,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -367,95 +367,95 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "21:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqh { z20.h }, p0/Z, [x26]\n"
+      "ld1rqh { z19.h }, p0/Z, [x25]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x6471e688  // bfmmla z8.s, z20.h, z17.h\n"
+      ".inst 0x6470e68c  // bfmmla z12.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6471e689  // bfmmla z9.s, z20.h, z17.h\n"
+      ".inst 0x6470e68d  // bfmmla z13.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x6471e68a  // bfmmla z10.s, z20.h, z17.h\n"
+      ".inst 0x6470e68e  // bfmmla z14.s, z20.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      ".inst 0x6471e68b  // bfmmla z11.s, z20.h, z17.h\n"
+      ".inst 0x6470e68f  // bfmmla z15.s, z20.h, z16.h\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "bgt 22b\n"
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqh { z19.h }, p0/Z, [x25]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
       "addvl x10, x10, #8\n"
       "ble 24f\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e428  // bfmmla z8.s, z1.h, z17.h\n"
+      ".inst 0x6470e42c  // bfmmla z12.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e429  // bfmmla z9.s, z1.h, z17.h\n"
+      ".inst 0x6470e42d  // bfmmla z13.s, z1.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e42a  // bfmmla z10.s, z1.h, z17.h\n"
+      ".inst 0x6470e42e  // bfmmla z14.s, z1.h, z16.h\n"
+      "ld1h { z22.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6476e42b  // bfmmla z11.s, z1.h, z22.h\n"
+      ".inst 0x6470e42f  // bfmmla z15.s, z1.h, z16.h\n"
       "addvl x10, x10, #8\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -474,25 +474,25 @@
       "uzp2 z11.d, z11.d, z15.d\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z17.s\n"
+      "fmin z12.s, p5/M, z12.s, z17.s\n"
+      "fmin z13.s, p5/M, z13.s, z17.s\n"
+      "fmin z14.s, p5/M, z14.s, z17.s\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z7.s, p5/M, z7.s, z16.s\n"
+      "fmax z12.s, p5/M, z12.s, z16.s\n"
+      "fmax z13.s, p5/M, z13.s, z16.s\n"
+      "fmax z14.s, p5/M, z14.s, z16.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "25:"  // Height 2: No activation
       "st1w { z7.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -548,28 +548,28 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x20]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
@@ -601,13 +601,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -616,136 +616,136 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "34:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "ld1rqh { z30.h }, p0/Z, [x26]\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
+      "ld1rqh { z28.h }, p0/Z, [x24]\n"
+      "trn1 z27.d, z30.d, z24.d\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "trn1 z26.d, z28.d, z29.d\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6479e768  // bfmmla z8.s, z27.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e76c  // bfmmla z12.s, z27.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e769  // bfmmla z9.s, z27.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z29.d\n"
+      ".inst 0x6478e76d  // bfmmla z13.s, z27.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6479e76a  // bfmmla z10.s, z27.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6478e76e  // bfmmla z14.s, z27.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x6479e76b  // bfmmla z11.s, z27.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x6478e76f  // bfmmla z15.s, z27.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x6479e7c8  // bfmmla z8.s, z30.h, z25.h\n"
+      ".inst 0x6479e790  // bfmmla z16.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6478e7cc  // bfmmla z12.s, z30.h, z24.h\n"
+      ".inst 0x6478e794  // bfmmla z20.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6479e7c9  // bfmmla z9.s, z30.h, z25.h\n"
+      ".inst 0x6479e791  // bfmmla z17.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6478e7cd  // bfmmla z13.s, z30.h, z24.h\n"
+      ".inst 0x6478e795  // bfmmla z21.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x6479e7ca  // bfmmla z10.s, z30.h, z25.h\n"
+      ".inst 0x6479e792  // bfmmla z18.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x6478e7ce  // bfmmla z14.s, z30.h, z24.h\n"
+      ".inst 0x6478e796  // bfmmla z22.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x6479e7cb  // bfmmla z11.s, z30.h, z25.h\n"
+      ".inst 0x6479e793  // bfmmla z19.s, z28.h, z25.h\n"
+      ".inst 0x6478e7cf  // bfmmla z15.s, z30.h, z24.h\n"
+      ".inst 0x6478e797  // bfmmla z23.s, z28.h, z24.h\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn1 z27.d, z1.d, z24.d\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "trn1 z26.d, z3.d, z28.d\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6479e768  // bfmmla z8.s, z27.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e76c  // bfmmla z12.s, z27.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e769  // bfmmla z9.s, z27.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6478e76d  // bfmmla z13.s, z27.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z28.d\n"
+      ".inst 0x6479e76a  // bfmmla z10.s, z27.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e76e  // bfmmla z14.s, z27.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
+      ".inst 0x6479e76b  // bfmmla z11.s, z27.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      ".inst 0x6478e76f  // bfmmla z15.s, z27.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
       "ble 37f\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6479e428  // bfmmla z8.s, z1.h, z25.h\n"
+      ".inst 0x6479e470  // bfmmla z16.s, z3.h, z25.h\n"
+      ".inst 0x6478e42c  // bfmmla z12.s, z1.h, z24.h\n"
+      ".inst 0x6478e474  // bfmmla z20.s, z3.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e429  // bfmmla z9.s, z1.h, z25.h\n"
+      ".inst 0x6479e471  // bfmmla z17.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6478e42d  // bfmmla z13.s, z1.h, z24.h\n"
+      ".inst 0x6478e475  // bfmmla z21.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6479e42a  // bfmmla z10.s, z1.h, z25.h\n"
+      ".inst 0x6479e472  // bfmmla z18.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e42e  // bfmmla z14.s, z1.h, z24.h\n"
+      ".inst 0x6478e476  // bfmmla z22.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e42b  // bfmmla z11.s, z1.h, z25.h\n"
+      ".inst 0x6479e473  // bfmmla z19.s, z3.h, z25.h\n"
+      ".inst 0x6478e42f  // bfmmla z15.s, z1.h, z24.h\n"
+      ".inst 0x6478e477  // bfmmla z23.s, z3.h, z24.h\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -768,33 +768,33 @@
       "uzp1 z19.d, z19.d, z23.d\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z25.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z25.s\n"
+      "fmin z12.s, p5/M, z12.s, z25.s\n"
+      "fmin z13.s, p5/M, z13.s, z25.s\n"
+      "fmin z14.s, p5/M, z14.s, z25.s\n"
+      "fmin z8.s, p5/M, z8.s, z25.s\n"
+      "fmin z9.s, p5/M, z9.s, z25.s\n"
+      "fmin z10.s, p5/M, z10.s, z25.s\n"
+      "fmin z11.s, p5/M, z11.s, z25.s\n"
+      "fmin z16.s, p5/M, z16.s, z25.s\n"
+      "fmin z17.s, p5/M, z17.s, z25.s\n"
+      "fmin z18.s, p5/M, z18.s, z25.s\n"
+      "fmin z19.s, p5/M, z19.s, z25.s\n"
+      "fmax z7.s, p5/M, z7.s, z24.s\n"
+      "fmax z12.s, p5/M, z12.s, z24.s\n"
+      "fmax z13.s, p5/M, z13.s, z24.s\n"
+      "fmax z14.s, p5/M, z14.s, z24.s\n"
+      "fmax z8.s, p5/M, z8.s, z24.s\n"
+      "fmax z9.s, p5/M, z9.s, z24.s\n"
+      "fmax z10.s, p5/M, z10.s, z24.s\n"
+      "fmax z11.s, p5/M, z11.s, z24.s\n"
+      "fmax z16.s, p5/M, z16.s, z24.s\n"
+      "fmax z17.s, p5/M, z17.s, z24.s\n"
+      "fmax z18.s, p5/M, z18.s, z24.s\n"
+      "fmax z19.s, p5/M, z19.s, z24.s\n"
       "38:"  // Height 3: No activation
       "st1w { z7.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -854,37 +854,37 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x21]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
@@ -912,14 +912,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -929,140 +929,140 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "47:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "ld1rqh { z30.h }, p0/Z, [x26]\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
+      "trn1 z29.d, z30.d, z24.d\n"
+      "ld1rqh { z28.h }, p0/Z, [x24]\n"
+      "ld1rqh { z27.h }, p0/Z, [x23]\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "trn1 z26.d, z28.d, z27.d\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6479e7a8  // bfmmla z8.s, z29.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e7ac  // bfmmla z12.s, z29.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e7a9  // bfmmla z9.s, z29.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z27.d\n"
+      ".inst 0x6478e7ad  // bfmmla z13.s, z29.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6479e7aa  // bfmmla z10.s, z29.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
       "cmp x27, #0x8\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6478e7ae  // bfmmla z14.s, z29.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x6479e7ab  // bfmmla z11.s, z29.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x6478e7af  // bfmmla z15.s, z29.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x6479e7c8  // bfmmla z8.s, z30.h, z25.h\n"
+      ".inst 0x6479e790  // bfmmla z16.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6478e7cc  // bfmmla z12.s, z30.h, z24.h\n"
+      ".inst 0x6478e794  // bfmmla z20.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e7c9  // bfmmla z9.s, z30.h, z25.h\n"
+      ".inst 0x6479e791  // bfmmla z17.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6478e7cd  // bfmmla z13.s, z30.h, z24.h\n"
+      ".inst 0x6478e795  // bfmmla z21.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x6479e7ca  // bfmmla z10.s, z30.h, z25.h\n"
+      ".inst 0x6479e792  // bfmmla z18.s, z28.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x6478e7ce  // bfmmla z14.s, z30.h, z24.h\n"
+      ".inst 0x6478e796  // bfmmla z22.s, z28.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x6479e7cb  // bfmmla z11.s, z30.h, z25.h\n"
+      ".inst 0x6479e793  // bfmmla z19.s, z28.h, z25.h\n"
+      ".inst 0x6478e7cf  // bfmmla z15.s, z30.h, z24.h\n"
+      ".inst 0x6478e797  // bfmmla z23.s, z28.h, z24.h\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqh { z24.h }, p0/Z, [x25]\n"
+      "trn1 z28.d, z1.d, z24.d\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1rqh { z27.h }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "trn1 z26.d, z3.d, z27.d\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6479e788  // bfmmla z8.s, z28.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e78c  // bfmmla z12.s, z28.h, z24.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e789  // bfmmla z9.s, z28.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6478e78d  // bfmmla z13.s, z28.h, z24.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z27.d\n"
+      ".inst 0x6479e78a  // bfmmla z10.s, z28.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e78e  // bfmmla z14.s, z28.h, z24.h\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
+      ".inst 0x6479e78b  // bfmmla z11.s, z28.h, z25.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      ".inst 0x6478e78f  // bfmmla z15.s, z28.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
       "ble 50f\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6479e428  // bfmmla z8.s, z1.h, z25.h\n"
+      ".inst 0x6479e470  // bfmmla z16.s, z3.h, z25.h\n"
+      ".inst 0x6478e42c  // bfmmla z12.s, z1.h, z24.h\n"
+      ".inst 0x6478e474  // bfmmla z20.s, z3.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e429  // bfmmla z9.s, z1.h, z25.h\n"
+      ".inst 0x6479e471  // bfmmla z17.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6478e42d  // bfmmla z13.s, z1.h, z24.h\n"
+      ".inst 0x6478e475  // bfmmla z21.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6479e42a  // bfmmla z10.s, z1.h, z25.h\n"
+      ".inst 0x6479e472  // bfmmla z18.s, z3.h, z25.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e42e  // bfmmla z14.s, z1.h, z24.h\n"
+      ".inst 0x6478e476  // bfmmla z22.s, z3.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
+      ".inst 0x6479e42b  // bfmmla z11.s, z1.h, z25.h\n"
+      ".inst 0x6479e473  // bfmmla z19.s, z3.h, z25.h\n"
+      ".inst 0x6478e42f  // bfmmla z15.s, z1.h, z24.h\n"
+      ".inst 0x6478e477  // bfmmla z23.s, z3.h, z24.h\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1090,41 +1090,41 @@
       "uzp2 z19.d, z19.d, z23.d\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z23.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z24.s\n"
+      "fmin z12.s, p5/M, z12.s, z24.s\n"
+      "fmin z13.s, p5/M, z13.s, z24.s\n"
+      "fmin z14.s, p5/M, z14.s, z24.s\n"
+      "fmin z8.s, p5/M, z8.s, z24.s\n"
+      "fmin z9.s, p5/M, z9.s, z24.s\n"
+      "fmin z10.s, p5/M, z10.s, z24.s\n"
+      "fmin z11.s, p5/M, z11.s, z24.s\n"
+      "fmin z15.s, p5/M, z15.s, z24.s\n"
+      "fmin z20.s, p5/M, z20.s, z24.s\n"
+      "fmin z21.s, p5/M, z21.s, z24.s\n"
+      "fmin z22.s, p5/M, z22.s, z24.s\n"
+      "fmin z16.s, p5/M, z16.s, z24.s\n"
+      "fmin z17.s, p5/M, z17.s, z24.s\n"
+      "fmin z18.s, p5/M, z18.s, z24.s\n"
+      "fmin z19.s, p5/M, z19.s, z24.s\n"
+      "fmax z7.s, p5/M, z7.s, z23.s\n"
+      "fmax z12.s, p5/M, z12.s, z23.s\n"
+      "fmax z13.s, p5/M, z13.s, z23.s\n"
+      "fmax z14.s, p5/M, z14.s, z23.s\n"
+      "fmax z8.s, p5/M, z8.s, z23.s\n"
+      "fmax z9.s, p5/M, z9.s, z23.s\n"
+      "fmax z10.s, p5/M, z10.s, z23.s\n"
+      "fmax z11.s, p5/M, z11.s, z23.s\n"
+      "fmax z15.s, p5/M, z15.s, z23.s\n"
+      "fmax z20.s, p5/M, z20.s, z23.s\n"
+      "fmax z21.s, p5/M, z21.s, z23.s\n"
+      "fmax z22.s, p5/M, z22.s, z23.s\n"
+      "fmax z16.s, p5/M, z16.s, z23.s\n"
+      "fmax z17.s, p5/M, z17.s, z23.s\n"
+      "fmax z18.s, p5/M, z18.s, z23.s\n"
+      "fmax z19.s, p5/M, z19.s, z23.s\n"
       "51:"  // Height 4: No activation
       "st1w { z7.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1196,54 +1196,54 @@
       "55:"  // Height 5: no bias
       "tbz %x[flags], #0, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x22]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x22]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x20]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z19.d, z24.d, z23.d\n"
       "zip2 z23.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z24.d, z25.d, z28.d\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 57f\n"
       "56:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1275,15 +1275,15 @@
       "58:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1294,180 +1294,180 @@
       "b 60f\n"
       "59:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "60:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "ble 62f\n"
       "61:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqh { z6.h }, p0/Z, [x26]\n"
+      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z7.h }, p0/Z, [x24]\n"
+      "ld1rqh { z2.h }, p0/Z, [x23]\n"
+      "trn1 z5.d, z6.d, z1.d\n"
+      "trn2 z6.d, z6.d, z1.d\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "trn1 z3.d, z7.d, z2.d\n"
+      "trn2 z7.d, z7.d, z2.d\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "trn1 z2.d, z4.d, z0.d\n"
+      "trn2 z4.d, z4.d, z0.d\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6461e4a8  // bfmmla z8.s, z5.h, z1.h\n"
+      ".inst 0x6461e470  // bfmmla z16.s, z3.h, z1.h\n"
+      ".inst 0x6461e458  // bfmmla z24.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
+      ".inst 0x6460e4ac  // bfmmla z12.s, z5.h, z0.h\n"
+      ".inst 0x6460e474  // bfmmla z20.s, z3.h, z0.h\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6460e45c  // bfmmla z28.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4a9  // bfmmla z9.s, z5.h, z1.h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6461e471  // bfmmla z17.s, z3.h, z1.h\n"
+      ".inst 0x6461e459  // bfmmla z25.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
+      ".inst 0x6460e4ad  // bfmmla z13.s, z5.h, z0.h\n"
+      ".inst 0x6460e475  // bfmmla z21.s, z3.h, z0.h\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6460e45d  // bfmmla z29.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461e4aa  // bfmmla z10.s, z5.h, z1.h\n"
+      ".inst 0x6461e472  // bfmmla z18.s, z3.h, z1.h\n"
+      ".inst 0x6461e45a  // bfmmla z26.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4ae  // bfmmla z14.s, z5.h, z0.h\n"
+      ".inst 0x6460e476  // bfmmla z22.s, z3.h, z0.h\n"
+      ".inst 0x6460e45e  // bfmmla z30.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6461e4ab  // bfmmla z11.s, z5.h, z1.h\n"
+      ".inst 0x6461e473  // bfmmla z19.s, z3.h, z1.h\n"
+      ".inst 0x6461e45b  // bfmmla z27.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x6460e4af  // bfmmla z15.s, z5.h, z0.h\n"
+      ".inst 0x6460e477  // bfmmla z23.s, z3.h, z0.h\n"
+      ".inst 0x6460e45f  // bfmmla z31.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x6461e4c8  // bfmmla z8.s, z6.h, z1.h\n"
+      ".inst 0x6461e4f0  // bfmmla z16.s, z7.h, z1.h\n"
+      ".inst 0x6461e498  // bfmmla z24.s, z4.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x6460e4cc  // bfmmla z12.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f4  // bfmmla z20.s, z7.h, z0.h\n"
+      ".inst 0x6460e49c  // bfmmla z28.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6461e4c9  // bfmmla z9.s, z6.h, z1.h\n"
+      ".inst 0x6461e4f1  // bfmmla z17.s, z7.h, z1.h\n"
+      ".inst 0x6461e499  // bfmmla z25.s, z4.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6460e4cd  // bfmmla z13.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f5  // bfmmla z21.s, z7.h, z0.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x6461e4ca  // bfmmla z10.s, z6.h, z1.h\n"
+      ".inst 0x6461e4f2  // bfmmla z18.s, z7.h, z1.h\n"
+      ".inst 0x6461e49a  // bfmmla z26.s, z4.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x6460e4ce  // bfmmla z14.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f6  // bfmmla z22.s, z7.h, z0.h\n"
+      ".inst 0x6460e49e  // bfmmla z30.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x6461e4cb  // bfmmla z11.s, z6.h, z1.h\n"
+      ".inst 0x6461e4f3  // bfmmla z19.s, z7.h, z1.h\n"
+      ".inst 0x6461e49b  // bfmmla z27.s, z4.h, z1.h\n"
+      ".inst 0x6460e4cf  // bfmmla z15.s, z6.h, z0.h\n"
+      ".inst 0x6460e4f7  // bfmmla z23.s, z7.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "bgt 61b\n"
       "62:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
+      "ld1rqh { z4.h }, p0/Z, [x25]\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
+      "ld1rqh { z2.h }, p0/Z, [x23]\n"
+      "trn1 z7.d, z1.d, z4.d\n"
+      "trn2 z1.d, z1.d, z4.d\n"
       "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "trn1 z6.d, z3.d, z2.d\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e4e8  // bfmmla z8.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d0  // bfmmla z16.s, z6.h, z2.h\n"
+      ".inst 0x6462e498  // bfmmla z24.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d4  // bfmmla z20.s, z6.h, z0.h\n"
+      ".inst 0x6460e49c  // bfmmla z28.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6462e4e9  // bfmmla z9.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d1  // bfmmla z17.s, z6.h, z2.h\n"
+      ".inst 0x6462e499  // bfmmla z25.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d5  // bfmmla z21.s, z6.h, z0.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6462e4ea  // bfmmla z10.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d2  // bfmmla z18.s, z6.h, z2.h\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d6  // bfmmla z22.s, z6.h, z0.h\n"
+      ".inst 0x6460e49e  // bfmmla z30.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6462e4eb  // bfmmla z11.s, z7.h, z2.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
+      ".inst 0x6462e4d3  // bfmmla z19.s, z6.h, z2.h\n"
+      ".inst 0x6462e49b  // bfmmla z27.s, z4.h, z2.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d7  // bfmmla z23.s, z6.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "ble 63f\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e428  // bfmmla z8.s, z1.h, z2.h\n"
+      ".inst 0x6462e470  // bfmmla z16.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b8  // bfmmla z24.s, z5.h, z2.h\n"
+      ".inst 0x6460e42c  // bfmmla z12.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e474  // bfmmla z20.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bc  // bfmmla z28.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6462e429  // bfmmla z9.s, z1.h, z2.h\n"
+      ".inst 0x6462e471  // bfmmla z17.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b9  // bfmmla z25.s, z5.h, z2.h\n"
+      ".inst 0x6460e42d  // bfmmla z13.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e475  // bfmmla z21.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6462e42a  // bfmmla z10.s, z1.h, z2.h\n"
+      ".inst 0x6462e472  // bfmmla z18.s, z3.h, z2.h\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6460e42e  // bfmmla z14.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e476  // bfmmla z22.s, z3.h, z0.h\n"
+      ".inst 0x6460e4be  // bfmmla z30.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6462e42b  // bfmmla z11.s, z1.h, z2.h\n"
+      ".inst 0x6462e473  // bfmmla z19.s, z3.h, z2.h\n"
+      ".inst 0x6462e4bb  // bfmmla z27.s, z5.h, z2.h\n"
+      ".inst 0x6460e42f  // bfmmla z15.s, z1.h, z0.h\n"
+      ".inst 0x6460e477  // bfmmla z23.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "63:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1500,49 +1500,49 @@
       "uzp1 z27.d, z27.d, z31.d\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
       "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z7.s, p5/M, z7.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z24.s, p5/M, z24.s, z1.s\n"
-      "fmin z25.s, p5/M, z25.s, z1.s\n"
-      "fmin z26.s, p5/M, z26.s, z1.s\n"
-      "fmin z27.s, p5/M, z27.s, z1.s\n"
-      "fmax z7.s, p5/M, z7.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z24.s, p5/M, z24.s, z0.s\n"
-      "fmax z25.s, p5/M, z25.s, z0.s\n"
-      "fmax z26.s, p5/M, z26.s, z0.s\n"
-      "fmax z27.s, p5/M, z27.s, z0.s\n"
+      "add x20, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z23.s }, p5/Z, [x20]\n"
+      "fmin z7.s, p5/M, z7.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmax z7.s, p5/M, z7.s, z23.s\n"
+      "fmax z12.s, p5/M, z12.s, z23.s\n"
+      "fmax z13.s, p5/M, z13.s, z23.s\n"
+      "fmax z14.s, p5/M, z14.s, z23.s\n"
+      "fmax z8.s, p5/M, z8.s, z23.s\n"
+      "fmax z9.s, p5/M, z9.s, z23.s\n"
+      "fmax z10.s, p5/M, z10.s, z23.s\n"
+      "fmax z11.s, p5/M, z11.s, z23.s\n"
+      "fmax z15.s, p5/M, z15.s, z23.s\n"
+      "fmax z20.s, p5/M, z20.s, z23.s\n"
+      "fmax z21.s, p5/M, z21.s, z23.s\n"
+      "fmax z22.s, p5/M, z22.s, z23.s\n"
+      "fmax z16.s, p5/M, z16.s, z23.s\n"
+      "fmax z17.s, p5/M, z17.s, z23.s\n"
+      "fmax z18.s, p5/M, z18.s, z23.s\n"
+      "fmax z19.s, p5/M, z19.s, z23.s\n"
+      "fmax z24.s, p5/M, z24.s, z23.s\n"
+      "fmax z25.s, p5/M, z25.s, z23.s\n"
+      "fmax z26.s, p5/M, z26.s, z23.s\n"
+      "fmax z27.s, p5/M, z27.s, z23.s\n"
       "64:"  // Height 5: No activation
       "st1w { z7.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1621,59 +1621,59 @@
       "68:"  // Height 6: no bias
       "tbz %x[flags], #0, 69f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
+      "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z17.s }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
       "add x21, x22, x20, LSL #2\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x24]\n"
+      "zip1 z8.d, z17.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "zip2 z12.d, z17.d, z12.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x23]\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z20.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip2 z14.d, z20.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x22]\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "zip1 z16.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x22]\n"
+      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x21]\n"
       "zip2 z21.d, z18.d, z21.d\n"
       "zip1 z18.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20]\n"
       "zip2 z23.d, z24.d, z23.d\n"
       "zip1 z24.d, z25.d, z28.d\n"
-      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
-      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 70f\n"
       "69:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1705,16 +1705,16 @@
       "71:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 72f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 73f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1726,184 +1726,184 @@
       "b 73f\n"
       "72:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "73:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "ble 75f\n"
       "74:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "ld1rqh { z6.h }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqh { z7.h }, p0/Z, [x26]\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
+      "trn1 z6.d, z7.d, z0.d\n"
+      "ld1rqh { z5.h }, p0/Z, [x24]\n"
+      "ld1rqh { z1.h }, p0/Z, [x23]\n"
+      "trn2 z7.d, z7.d, z0.d\n"
+      "trn1 z4.d, z5.d, z1.d\n"
+      "ld1rqh { z3.h }, p0/Z, [x22]\n"
+      "ld1rqh { z0.h }, p0/Z, [x21]\n"
+      "trn2 z5.d, z5.d, z1.d\n"
+      "trn1 z2.d, z3.d, z0.d\n"
+      "trn2 z3.d, z3.d, z0.d\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6461e4c8  // bfmmla z8.s, z6.h, z1.h\n"
+      ".inst 0x6461e490  // bfmmla z16.s, z4.h, z1.h\n"
+      ".inst 0x6461e458  // bfmmla z24.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
       "sub x27, x27, #0x8\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
+      ".inst 0x6460e4cc  // bfmmla z12.s, z6.h, z0.h\n"
+      ".inst 0x6460e494  // bfmmla z20.s, z4.h, z0.h\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6460e45c  // bfmmla z28.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4c9  // bfmmla z9.s, z6.h, z1.h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6461e491  // bfmmla z17.s, z4.h, z1.h\n"
+      ".inst 0x6461e459  // bfmmla z25.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
+      ".inst 0x6460e4cd  // bfmmla z13.s, z6.h, z0.h\n"
+      ".inst 0x6460e495  // bfmmla z21.s, z4.h, z0.h\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
+      ".inst 0x6460e45d  // bfmmla z29.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461e4ca  // bfmmla z10.s, z6.h, z1.h\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6461e492  // bfmmla z18.s, z4.h, z1.h\n"
+      ".inst 0x6461e45a  // bfmmla z26.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4ce  // bfmmla z14.s, z6.h, z0.h\n"
+      ".inst 0x6460e496  // bfmmla z22.s, z4.h, z0.h\n"
+      ".inst 0x6460e45e  // bfmmla z30.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6461e4cb  // bfmmla z11.s, z6.h, z1.h\n"
+      ".inst 0x6461e493  // bfmmla z19.s, z4.h, z1.h\n"
+      ".inst 0x6461e45b  // bfmmla z27.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x6460e4cf  // bfmmla z15.s, z6.h, z0.h\n"
+      ".inst 0x6460e497  // bfmmla z23.s, z4.h, z0.h\n"
+      ".inst 0x6460e45f  // bfmmla z31.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x6461e4e8  // bfmmla z8.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b0  // bfmmla z16.s, z5.h, z1.h\n"
+      ".inst 0x6461e478  // bfmmla z24.s, z3.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b4  // bfmmla z20.s, z5.h, z0.h\n"
+      ".inst 0x6460e47c  // bfmmla z28.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b1  // bfmmla z17.s, z5.h, z1.h\n"
+      ".inst 0x6461e479  // bfmmla z25.s, z3.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b5  // bfmmla z21.s, z5.h, z0.h\n"
+      ".inst 0x6460e47d  // bfmmla z29.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x6461e4ea  // bfmmla z10.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b2  // bfmmla z18.s, z5.h, z1.h\n"
+      ".inst 0x6461e47a  // bfmmla z26.s, z3.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b6  // bfmmla z22.s, z5.h, z0.h\n"
+      ".inst 0x6460e47e  // bfmmla z30.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b3  // bfmmla z19.s, z5.h, z1.h\n"
+      ".inst 0x6461e47b  // bfmmla z27.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b7  // bfmmla z23.s, z5.h, z0.h\n"
+      ".inst 0x6460e47f  // bfmmla z31.s, z3.h, z0.h\n"
       "bgt 74b\n"
       "75:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z1.h }, p0/Z, [x26]\n"
-      "ld1rqh { z2.h }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
+      "trn1 z7.d, z1.d, z0.d\n"
       "ld1rqh { z3.h }, p0/Z, [x24]\n"
-      "ld1rqh { z4.h }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
+      "ld1rqh { z2.h }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z0.d\n"
+      "trn1 z6.d, z3.d, z2.d\n"
       "ld1rqh { z5.h }, p0/Z, [x22]\n"
-      "ld1rqh { z6.h }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e408  // bfmmla z8.s, z0.h, z7.h\n"
-      ".inst 0x6467e450  // bfmmla z16.s, z2.h, z7.h\n"
-      ".inst 0x6467e498  // bfmmla z24.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqh { z0.h }, p0/Z, [x21]\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e4e8  // bfmmla z8.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d0  // bfmmla z16.s, z6.h, z2.h\n"
+      ".inst 0x6462e498  // bfmmla z24.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      ".inst 0x6466e40c  // bfmmla z12.s, z0.h, z6.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      ".inst 0x6466e49c  // bfmmla z28.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
-      ".inst 0x6467e451  // bfmmla z17.s, z2.h, z7.h\n"
-      ".inst 0x6467e499  // bfmmla z25.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6466e49d  // bfmmla z29.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e40a  // bfmmla z10.s, z0.h, z7.h\n"
-      ".inst 0x6467e452  // bfmmla z18.s, z2.h, z7.h\n"
-      ".inst 0x6467e49a  // bfmmla z26.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e40e  // bfmmla z14.s, z0.h, z6.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6466e49e  // bfmmla z30.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d4  // bfmmla z20.s, z6.h, z0.h\n"
+      ".inst 0x6460e49c  // bfmmla z28.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6462e4e9  // bfmmla z9.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d1  // bfmmla z17.s, z6.h, z2.h\n"
+      ".inst 0x6462e499  // bfmmla z25.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d5  // bfmmla z21.s, z6.h, z0.h\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6462e4ea  // bfmmla z10.s, z7.h, z2.h\n"
+      ".inst 0x6462e4d2  // bfmmla z18.s, z6.h, z2.h\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d6  // bfmmla z22.s, z6.h, z0.h\n"
+      ".inst 0x6460e49e  // bfmmla z30.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6462e4eb  // bfmmla z11.s, z7.h, z2.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e453  // bfmmla z19.s, z2.h, z7.h\n"
-      ".inst 0x6467e49b  // bfmmla z27.s, z4.h, z7.h\n"
-      ".inst 0x6466e40f  // bfmmla z15.s, z0.h, z6.h\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6466e49f  // bfmmla z31.s, z4.h, z6.h\n"
+      ".inst 0x6462e4d3  // bfmmla z19.s, z6.h, z2.h\n"
+      ".inst 0x6462e49b  // bfmmla z27.s, z4.h, z2.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4d7  // bfmmla z23.s, z6.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "ble 76f\n"
-      "ld1h { z7.h }, p5/Z, [x10]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6467e428  // bfmmla z8.s, z1.h, z7.h\n"
-      ".inst 0x6467e470  // bfmmla z16.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b8  // bfmmla z24.s, z5.h, z7.h\n"
-      ".inst 0x6466e42c  // bfmmla z12.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6466e474  // bfmmla z20.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bc  // bfmmla z28.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6467e429  // bfmmla z9.s, z1.h, z7.h\n"
-      ".inst 0x6467e471  // bfmmla z17.s, z3.h, z7.h\n"
-      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
-      ".inst 0x6466e42d  // bfmmla z13.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6466e475  // bfmmla z21.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bd  // bfmmla z29.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6467e42a  // bfmmla z10.s, z1.h, z7.h\n"
-      ".inst 0x6467e472  // bfmmla z18.s, z3.h, z7.h\n"
-      ".inst 0x6467e4ba  // bfmmla z26.s, z5.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6466e476  // bfmmla z22.s, z3.h, z6.h\n"
-      ".inst 0x6466e4be  // bfmmla z30.s, z5.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z2.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6462e428  // bfmmla z8.s, z1.h, z2.h\n"
+      ".inst 0x6462e470  // bfmmla z16.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b8  // bfmmla z24.s, z5.h, z2.h\n"
+      ".inst 0x6460e42c  // bfmmla z12.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e474  // bfmmla z20.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bc  // bfmmla z28.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6462e429  // bfmmla z9.s, z1.h, z2.h\n"
+      ".inst 0x6462e471  // bfmmla z17.s, z3.h, z2.h\n"
+      ".inst 0x6462e4b9  // bfmmla z25.s, z5.h, z2.h\n"
+      ".inst 0x6460e42d  // bfmmla z13.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e475  // bfmmla z21.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6462e42a  // bfmmla z10.s, z1.h, z2.h\n"
+      ".inst 0x6462e472  // bfmmla z18.s, z3.h, z2.h\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6460e42e  // bfmmla z14.s, z1.h, z0.h\n"
+      "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e476  // bfmmla z22.s, z3.h, z0.h\n"
+      ".inst 0x6460e4be  // bfmmla z30.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e42b  // bfmmla z11.s, z1.h, z7.h\n"
-      ".inst 0x6467e473  // bfmmla z19.s, z3.h, z7.h\n"
-      ".inst 0x6467e4bb  // bfmmla z27.s, z5.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6466e477  // bfmmla z23.s, z3.h, z6.h\n"
-      ".inst 0x6466e4bf  // bfmmla z31.s, z5.h, z6.h\n"
+      ".inst 0x6462e42b  // bfmmla z11.s, z1.h, z2.h\n"
+      ".inst 0x6462e473  // bfmmla z19.s, z3.h, z2.h\n"
+      ".inst 0x6462e4bb  // bfmmla z27.s, z5.h, z2.h\n"
+      ".inst 0x6460e42f  // bfmmla z15.s, z1.h, z0.h\n"
+      ".inst 0x6460e477  // bfmmla z23.s, z3.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "76:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2041,4 +2041,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
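The hunks above appear to be purely a re-allocation of scratch Z registers (the weight-panel and row operands move, the z8-z31 accumulators stay put), with each ".inst" encoding updated to match its comment. For readers decoding those comments, the following is a minimal scalar sketch of what one BFMMLA issue does to a single 128-bit segment of its operands. It is an illustrative reference model under the usual description of the instruction, not the library's code, and hardware rounding and denormal behaviour are not modelled.

#include <cstdint>
#include <cstring>

// Convert one bfloat16 value (the high half of an IEEE-754 float) to float.
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// One 128-bit segment of "bfmmla zda.s, zn.h, zm.h":
//   a and b each carry eight bf16 values, viewed as two rows of four;
//   acc carries the 2x2 fp32 tile laid out as {C00, C01, C10, C11}.
// Each C[i][j] accumulates the dot product of row i of a with row j of b.
static void bfmmla_segment(float acc[4], const uint16_t a[8], const uint16_t b[8]) {
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 2; ++j) {
            float sum = acc[i * 2 + j];
            for (int k = 0; k < 4; ++k) {
                sum += bf16_to_f32(a[i * 4 + k]) * bf16_to_f32(b[j * 4 + k]);
            }
            acc[i * 2 + j] = sum;
        }
    }
}

On this reading the renames are cosmetic: as long as the trn1/trn2 pairs still feed two interleaved rows into the first operand and the ld1h loads still feed the weight panel into the second, every accumulator receives the same 2x2 tiles as before.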
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index 6db9c0c..b930e4c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,16 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
 #ifdef ARM_COMPUTE_ENABLE_SVE
@@ -75,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, __fp16>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -84,6 +83,8 @@
                     return { 12.44 };
                 case CPUModel::V1:
                     return { 31.51 };
+                case CPUModel::A64FX:
+                    return { 49.14 };
             }
         }
 
@@ -107,5 +108,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
-#endif // __aarch64__
+#endif // ARM_COMPUTE_ENABLE_SVE
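Alongside the guard and licence-text cleanup, the header above gains an A64FX entry in its per-CPU-model performance estimate. As a loose illustration of that selection pattern (all names here, CpuModel, PerfParams, fp16_mla_estimate, are hypothetical stand-ins rather than the library's types, and the fallback figure is borrowed from the visible hunk context), such a lookup has roughly this shape:

// Hypothetical sketch of a per-model performance table like the one extended above.
enum class CpuModel { GENERIC, V1, A64FX };

struct PerfParams {
    double estimated_throughput;  // illustrative field; the real struct may differ
};

static PerfParams fp16_mla_estimate(CpuModel model) {
    switch (model) {
        default:              return { 12.44 };  // figure from the hunk context; its exact case label is outside the visible diff
        case CpuModel::V1:    return { 31.51 };  // existing V1 value
        case CpuModel::A64FX: return { 49.14 };  // value introduced by this patch
    }
}

Estimates of this kind are consulted when ranking candidate kernels, so the added case only tunes which implementation is preferred on A64FX; it does not change what any kernel computes.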
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
index a70e66c..d1a9bb4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -139,11 +139,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -159,12 +159,12 @@
       "9:"  // Height 1: Multiply loop: Main loop
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x26, x26, #0x2\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1h { z6.h }, p4/Z, [x10]\n"
@@ -174,27 +174,27 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
       "addvl x10, x10, #4\n"
       "bne 6b\n"
       "tbz %x[flags], #1, 11f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z17.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
+      "ld1rh { z16.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z17.h\n"
+      "fmin z9.h, p4/M, z9.h, z17.h\n"
+      "fmin z10.h, p4/M, z10.h, z17.h\n"
+      "fmin z11.h, p4/M, z11.h, z17.h\n"
+      "fmax z8.h, p4/M, z8.h, z16.h\n"
+      "fmax z9.h, p4/M, z9.h, z16.h\n"
+      "fmax z10.h, p4/M, z10.h, z16.h\n"
+      "fmax z11.h, p4/M, z11.h, z16.h\n"
       "11:"  // Height 1: No activation
       "st1h { z8.h }, p3, [x9]\n"
       "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -234,15 +234,15 @@
       "15:"  // Height 2: no bias
       "tbz %x[flags], #0, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
+      "add x20, x9, x20, LSL #1\n"
       "ld1h { z8.h }, p3/Z, [x9]\n"
       "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x20]\n"
+      "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 17f\n"
       "16:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -258,12 +258,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -271,7 +271,7 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "20:"  // Height 2: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -282,18 +282,18 @@
       "21:"  // Height 2: Multiply loop: Main loop
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z12.h, p4/M, z6.h, z1.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
       "add x26, x26, #0x2\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "subs x27, x27, #0x1\n"
       "add x25, x25, #0x2\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z14.h, p4/M, z17.h, z1.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
+      "fmla z15.h, p4/M, z16.h, z1.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
       "ld1h { z6.h }, p4/Z, [x10]\n"
@@ -303,41 +303,41 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.h, p4/M, z6.h, z0.h\n"
       "fmla z12.h, p4/M, z6.h, z1.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
+      "fmla z10.h, p4/M, z17.h, z0.h\n"
+      "fmla z14.h, p4/M, z17.h, z1.h\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
+      "fmla z11.h, p4/M, z16.h, z0.h\n"
+      "fmla z15.h, p4/M, z16.h, z1.h\n"
       "bne 18b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #1\n"
       "tbz %x[flags], #1, 23f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z17.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
+      "ld1rh { z16.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z17.h\n"
+      "fmin z9.h, p4/M, z9.h, z17.h\n"
+      "fmin z10.h, p4/M, z10.h, z17.h\n"
+      "fmin z11.h, p4/M, z11.h, z17.h\n"
+      "fmin z12.h, p4/M, z12.h, z17.h\n"
+      "fmin z13.h, p4/M, z13.h, z17.h\n"
+      "fmin z14.h, p4/M, z14.h, z17.h\n"
+      "fmin z15.h, p4/M, z15.h, z17.h\n"
+      "fmax z8.h, p4/M, z8.h, z16.h\n"
+      "fmax z9.h, p4/M, z9.h, z16.h\n"
+      "fmax z10.h, p4/M, z10.h, z16.h\n"
+      "fmax z11.h, p4/M, z11.h, z16.h\n"
+      "fmax z12.h, p4/M, z12.h, z16.h\n"
+      "fmax z13.h, p4/M, z13.h, z16.h\n"
+      "fmax z14.h, p4/M, z14.h, z16.h\n"
+      "fmax z15.h, p4/M, z15.h, z16.h\n"
       "23:"  // Height 2: No activation
       "st1h { z8.h }, p3, [x9]\n"
       "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -385,20 +385,20 @@
       "27:"  // Height 3: no bias
       "tbz %x[flags], #0, 28f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x21, x9, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z8.h }, p3/Z, [x9]\n"
       "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x21]\n"
+      "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x20]\n"
+      "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 29f\n"
       "28:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -418,13 +418,13 @@
       "30:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 32f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -433,8 +433,8 @@
       "b 32f\n"
       "31:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "32:"  // Height 3: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -450,21 +450,21 @@
       "subs x27, x27, #0x1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x2\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x24, x24, #0x2\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z21.h, z0.h\n"
+      "fmla z14.h, p4/M, z21.h, z1.h\n"
+      "fmla z18.h, p4/M, z21.h, z2.h\n"
+      "fmla z11.h, p4/M, z20.h, z0.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1h { z6.h }, p4/Z, [x10]\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
+      "fmla z15.h, p4/M, z20.h, z1.h\n"
+      "fmla z19.h, p4/M, z20.h, z2.h\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
       "ld1rh { z2.h }, p4/Z, [x24]\n"
       "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -476,51 +476,51 @@
       "add x28, x28, #0x1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
+      "fmla z10.h, p4/M, z21.h, z0.h\n"
+      "fmla z14.h, p4/M, z21.h, z1.h\n"
+      "fmla z18.h, p4/M, z21.h, z2.h\n"
+      "fmla z11.h, p4/M, z20.h, z0.h\n"
+      "fmla z15.h, p4/M, z20.h, z1.h\n"
+      "fmla z19.h, p4/M, z20.h, z2.h\n"
       "bne 30b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #1\n"
       "add x24, x25, x20, LSL #1\n"
       "tbz %x[flags], #1, 35f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z21.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmin z16.h, p4/M, z16.h, z1.h\n"
-      "fmin z17.h, p4/M, z17.h, z1.h\n"
-      "fmin z18.h, p4/M, z18.h, z1.h\n"
-      "fmin z19.h, p4/M, z19.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
-      "fmax z16.h, p4/M, z16.h, z0.h\n"
-      "fmax z17.h, p4/M, z17.h, z0.h\n"
-      "fmax z18.h, p4/M, z18.h, z0.h\n"
-      "fmax z19.h, p4/M, z19.h, z0.h\n"
+      "ld1rh { z20.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z21.h\n"
+      "fmin z9.h, p4/M, z9.h, z21.h\n"
+      "fmin z10.h, p4/M, z10.h, z21.h\n"
+      "fmin z11.h, p4/M, z11.h, z21.h\n"
+      "fmin z12.h, p4/M, z12.h, z21.h\n"
+      "fmin z13.h, p4/M, z13.h, z21.h\n"
+      "fmin z14.h, p4/M, z14.h, z21.h\n"
+      "fmin z15.h, p4/M, z15.h, z21.h\n"
+      "fmin z16.h, p4/M, z16.h, z21.h\n"
+      "fmin z17.h, p4/M, z17.h, z21.h\n"
+      "fmin z18.h, p4/M, z18.h, z21.h\n"
+      "fmin z19.h, p4/M, z19.h, z21.h\n"
+      "fmax z8.h, p4/M, z8.h, z20.h\n"
+      "fmax z9.h, p4/M, z9.h, z20.h\n"
+      "fmax z10.h, p4/M, z10.h, z20.h\n"
+      "fmax z11.h, p4/M, z11.h, z20.h\n"
+      "fmax z12.h, p4/M, z12.h, z20.h\n"
+      "fmax z13.h, p4/M, z13.h, z20.h\n"
+      "fmax z14.h, p4/M, z14.h, z20.h\n"
+      "fmax z15.h, p4/M, z15.h, z20.h\n"
+      "fmax z16.h, p4/M, z16.h, z20.h\n"
+      "fmax z17.h, p4/M, z17.h, z20.h\n"
+      "fmax z18.h, p4/M, z18.h, z20.h\n"
+      "fmax z19.h, p4/M, z19.h, z20.h\n"
       "35:"  // Height 3: No activation
       "st1h { z8.h }, p3, [x9]\n"
       "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -576,25 +576,25 @@
       "39:"  // Height 4: no bias
       "tbz %x[flags], #0, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x22, x9, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z8.h }, p3/Z, [x9]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p3/Z, [x23]\n"
-      "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x22]\n"
+      "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x21]\n"
+      "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z20.h }, p3/Z, [x20]\n"
+      "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 41f\n"
       "40:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -618,14 +618,14 @@
       "42:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 43f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 44f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -635,9 +635,9 @@
       "b 44f\n"
       "43:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "44:"  // Height 4: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -654,7 +654,7 @@
       "subs x27, x27, #0x1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z20.h, p4/M, z6.h, z3.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x2\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
@@ -662,19 +662,19 @@
       "add x23, x23, #0x2\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
+      "fmla z10.h, p4/M, z25.h, z0.h\n"
+      "fmla z14.h, p4/M, z25.h, z1.h\n"
+      "fmla z18.h, p4/M, z25.h, z2.h\n"
+      "fmla z22.h, p4/M, z25.h, z3.h\n"
       "ld1h { z6.h }, p4/Z, [x10]\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
+      "fmla z11.h, p4/M, z24.h, z0.h\n"
+      "fmla z15.h, p4/M, z24.h, z1.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
+      "fmla z19.h, p4/M, z24.h, z2.h\n"
+      "fmla z23.h, p4/M, z24.h, z3.h\n"
       "ld1rh { z2.h }, p4/Z, [x24]\n"
       "ld1rh { z3.h }, p4/Z, [x23]\n"
       "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -686,22 +686,22 @@
       "add x28, x28, #0x1\n"
       "fmla z16.h, p4/M, z6.h, z2.h\n"
       "fmla z20.h, p4/M, z6.h, z3.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
+      "fmla z10.h, p4/M, z25.h, z0.h\n"
+      "fmla z14.h, p4/M, z25.h, z1.h\n"
+      "fmla z18.h, p4/M, z25.h, z2.h\n"
+      "fmla z22.h, p4/M, z25.h, z3.h\n"
+      "fmla z11.h, p4/M, z24.h, z0.h\n"
+      "fmla z15.h, p4/M, z24.h, z1.h\n"
+      "fmla z19.h, p4/M, z24.h, z2.h\n"
+      "fmla z23.h, p4/M, z24.h, z3.h\n"
       "bne 42b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #1\n"
@@ -709,41 +709,41 @@
       "add x23, x24, x20, LSL #1\n"
       "tbz %x[flags], #1, 47f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z25.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmin z16.h, p4/M, z16.h, z1.h\n"
-      "fmin z17.h, p4/M, z17.h, z1.h\n"
-      "fmin z18.h, p4/M, z18.h, z1.h\n"
-      "fmin z19.h, p4/M, z19.h, z1.h\n"
-      "fmin z20.h, p4/M, z20.h, z1.h\n"
-      "fmin z21.h, p4/M, z21.h, z1.h\n"
-      "fmin z22.h, p4/M, z22.h, z1.h\n"
-      "fmin z23.h, p4/M, z23.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
-      "fmax z16.h, p4/M, z16.h, z0.h\n"
-      "fmax z17.h, p4/M, z17.h, z0.h\n"
-      "fmax z18.h, p4/M, z18.h, z0.h\n"
-      "fmax z19.h, p4/M, z19.h, z0.h\n"
-      "fmax z20.h, p4/M, z20.h, z0.h\n"
-      "fmax z21.h, p4/M, z21.h, z0.h\n"
-      "fmax z22.h, p4/M, z22.h, z0.h\n"
-      "fmax z23.h, p4/M, z23.h, z0.h\n"
+      "ld1rh { z24.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z25.h\n"
+      "fmin z9.h, p4/M, z9.h, z25.h\n"
+      "fmin z10.h, p4/M, z10.h, z25.h\n"
+      "fmin z11.h, p4/M, z11.h, z25.h\n"
+      "fmin z12.h, p4/M, z12.h, z25.h\n"
+      "fmin z13.h, p4/M, z13.h, z25.h\n"
+      "fmin z14.h, p4/M, z14.h, z25.h\n"
+      "fmin z15.h, p4/M, z15.h, z25.h\n"
+      "fmin z16.h, p4/M, z16.h, z25.h\n"
+      "fmin z17.h, p4/M, z17.h, z25.h\n"
+      "fmin z18.h, p4/M, z18.h, z25.h\n"
+      "fmin z19.h, p4/M, z19.h, z25.h\n"
+      "fmin z20.h, p4/M, z20.h, z25.h\n"
+      "fmin z21.h, p4/M, z21.h, z25.h\n"
+      "fmin z22.h, p4/M, z22.h, z25.h\n"
+      "fmin z23.h, p4/M, z23.h, z25.h\n"
+      "fmax z8.h, p4/M, z8.h, z24.h\n"
+      "fmax z9.h, p4/M, z9.h, z24.h\n"
+      "fmax z10.h, p4/M, z10.h, z24.h\n"
+      "fmax z11.h, p4/M, z11.h, z24.h\n"
+      "fmax z12.h, p4/M, z12.h, z24.h\n"
+      "fmax z13.h, p4/M, z13.h, z24.h\n"
+      "fmax z14.h, p4/M, z14.h, z24.h\n"
+      "fmax z15.h, p4/M, z15.h, z24.h\n"
+      "fmax z16.h, p4/M, z16.h, z24.h\n"
+      "fmax z17.h, p4/M, z17.h, z24.h\n"
+      "fmax z18.h, p4/M, z18.h, z24.h\n"
+      "fmax z19.h, p4/M, z19.h, z24.h\n"
+      "fmax z20.h, p4/M, z20.h, z24.h\n"
+      "fmax z21.h, p4/M, z21.h, z24.h\n"
+      "fmax z22.h, p4/M, z22.h, z24.h\n"
+      "fmax z23.h, p4/M, z23.h, z24.h\n"
       "47:"  // Height 4: No activation
       "st1h { z8.h }, p3, [x9]\n"
       "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -807,30 +807,30 @@
       "51:"  // Height 5: no bias
       "tbz %x[flags], #0, 52f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p3/Z, [x9]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x23, x9, x20, LSL #1\n"
       "add x22, x23, x20, LSL #1\n"
+      "ld1h { z8.h }, p3/Z, [x9]\n"
+      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p3/Z, [x23]\n"
-      "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p3/Z, [x22]\n"
-      "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x23]\n"
+      "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x22]\n"
+      "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z20.h }, p3/Z, [x21]\n"
+      "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z24.h }, p3/Z, [x20]\n"
+      "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 53f\n"
       "52:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -858,15 +858,15 @@
       "54:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 55f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -877,10 +877,10 @@
       "b 56f\n"
       "55:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "56:"  // Height 5: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -902,29 +902,29 @@
       "add x24, x24, #0x2\n"
       "fmla z24.h, p4/M, z6.h, z4.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
       "add x23, x23, #0x2\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "add x22, x22, #0x2\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
       "fmla z25.h, p4/M, z7.h, z4.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
-      "fmla z26.h, p4/M, z6.h, z4.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
+      "fmla z10.h, p4/M, z29.h, z0.h\n"
+      "fmla z14.h, p4/M, z29.h, z1.h\n"
+      "fmla z18.h, p4/M, z29.h, z2.h\n"
+      "fmla z22.h, p4/M, z29.h, z3.h\n"
+      "fmla z26.h, p4/M, z29.h, z4.h\n"
+      "fmla z11.h, p4/M, z28.h, z0.h\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
       "ld1h { z6.h }, p4/Z, [x10]\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
+      "fmla z15.h, p4/M, z28.h, z1.h\n"
+      "fmla z19.h, p4/M, z28.h, z2.h\n"
       "ld1rh { z1.h }, p4/Z, [x25]\n"
       "ld1rh { z2.h }, p4/Z, [x24]\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
-      "fmla z27.h, p4/M, z7.h, z4.h\n"
+      "fmla z23.h, p4/M, z28.h, z3.h\n"
+      "fmla z27.h, p4/M, z28.h, z4.h\n"
       "ld1rh { z3.h }, p4/Z, [x23]\n"
       "ld1rh { z4.h }, p4/Z, [x22]\n"
       "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n"
@@ -939,23 +939,23 @@
       "cmp x28, x20\n"
       "fmla z24.h, p4/M, z6.h, z4.h\n"
       "fmla z9.h, p4/M, z7.h, z0.h\n"
-      "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n"
       "fmla z13.h, p4/M, z7.h, z1.h\n"
       "fmla z17.h, p4/M, z7.h, z2.h\n"
       "fmla z21.h, p4/M, z7.h, z3.h\n"
       "fmla z25.h, p4/M, z7.h, z4.h\n"
-      "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, p4/M, z6.h, z0.h\n"
-      "fmla z14.h, p4/M, z6.h, z1.h\n"
-      "fmla z18.h, p4/M, z6.h, z2.h\n"
-      "fmla z22.h, p4/M, z6.h, z3.h\n"
-      "fmla z26.h, p4/M, z6.h, z4.h\n"
-      "fmla z11.h, p4/M, z7.h, z0.h\n"
-      "fmla z15.h, p4/M, z7.h, z1.h\n"
-      "fmla z19.h, p4/M, z7.h, z2.h\n"
-      "fmla z23.h, p4/M, z7.h, z3.h\n"
-      "fmla z27.h, p4/M, z7.h, z4.h\n"
+      "fmla z10.h, p4/M, z29.h, z0.h\n"
+      "fmla z14.h, p4/M, z29.h, z1.h\n"
+      "fmla z18.h, p4/M, z29.h, z2.h\n"
+      "fmla z22.h, p4/M, z29.h, z3.h\n"
+      "fmla z26.h, p4/M, z29.h, z4.h\n"
+      "fmla z11.h, p4/M, z28.h, z0.h\n"
+      "fmla z15.h, p4/M, z28.h, z1.h\n"
+      "fmla z19.h, p4/M, z28.h, z2.h\n"
+      "fmla z23.h, p4/M, z28.h, z3.h\n"
+      "fmla z27.h, p4/M, z28.h, z4.h\n"
       "bne 54b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #1\n"
@@ -964,49 +964,49 @@
       "add x22, x23, x20, LSL #1\n"
       "tbz %x[flags], #1, 59f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p4/Z, [x20]\n"
+      "ld1rh { z29.h }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p4/Z, [x20]\n"
-      "fmin z8.h, p4/M, z8.h, z1.h\n"
-      "fmin z9.h, p4/M, z9.h, z1.h\n"
-      "fmin z10.h, p4/M, z10.h, z1.h\n"
-      "fmin z11.h, p4/M, z11.h, z1.h\n"
-      "fmin z12.h, p4/M, z12.h, z1.h\n"
-      "fmin z13.h, p4/M, z13.h, z1.h\n"
-      "fmin z14.h, p4/M, z14.h, z1.h\n"
-      "fmin z15.h, p4/M, z15.h, z1.h\n"
-      "fmin z16.h, p4/M, z16.h, z1.h\n"
-      "fmin z17.h, p4/M, z17.h, z1.h\n"
-      "fmin z18.h, p4/M, z18.h, z1.h\n"
-      "fmin z19.h, p4/M, z19.h, z1.h\n"
-      "fmin z20.h, p4/M, z20.h, z1.h\n"
-      "fmin z21.h, p4/M, z21.h, z1.h\n"
-      "fmin z22.h, p4/M, z22.h, z1.h\n"
-      "fmin z23.h, p4/M, z23.h, z1.h\n"
-      "fmin z24.h, p4/M, z24.h, z1.h\n"
-      "fmin z25.h, p4/M, z25.h, z1.h\n"
-      "fmin z26.h, p4/M, z26.h, z1.h\n"
-      "fmin z27.h, p4/M, z27.h, z1.h\n"
-      "fmax z8.h, p4/M, z8.h, z0.h\n"
-      "fmax z9.h, p4/M, z9.h, z0.h\n"
-      "fmax z10.h, p4/M, z10.h, z0.h\n"
-      "fmax z11.h, p4/M, z11.h, z0.h\n"
-      "fmax z12.h, p4/M, z12.h, z0.h\n"
-      "fmax z13.h, p4/M, z13.h, z0.h\n"
-      "fmax z14.h, p4/M, z14.h, z0.h\n"
-      "fmax z15.h, p4/M, z15.h, z0.h\n"
-      "fmax z16.h, p4/M, z16.h, z0.h\n"
-      "fmax z17.h, p4/M, z17.h, z0.h\n"
-      "fmax z18.h, p4/M, z18.h, z0.h\n"
-      "fmax z19.h, p4/M, z19.h, z0.h\n"
-      "fmax z20.h, p4/M, z20.h, z0.h\n"
-      "fmax z21.h, p4/M, z21.h, z0.h\n"
-      "fmax z22.h, p4/M, z22.h, z0.h\n"
-      "fmax z23.h, p4/M, z23.h, z0.h\n"
-      "fmax z24.h, p4/M, z24.h, z0.h\n"
-      "fmax z25.h, p4/M, z25.h, z0.h\n"
-      "fmax z26.h, p4/M, z26.h, z0.h\n"
-      "fmax z27.h, p4/M, z27.h, z0.h\n"
+      "ld1rh { z28.h }, p4/Z, [x20]\n"
+      "fmin z8.h, p4/M, z8.h, z29.h\n"
+      "fmin z9.h, p4/M, z9.h, z29.h\n"
+      "fmin z10.h, p4/M, z10.h, z29.h\n"
+      "fmin z11.h, p4/M, z11.h, z29.h\n"
+      "fmin z12.h, p4/M, z12.h, z29.h\n"
+      "fmin z13.h, p4/M, z13.h, z29.h\n"
+      "fmin z14.h, p4/M, z14.h, z29.h\n"
+      "fmin z15.h, p4/M, z15.h, z29.h\n"
+      "fmin z16.h, p4/M, z16.h, z29.h\n"
+      "fmin z17.h, p4/M, z17.h, z29.h\n"
+      "fmin z18.h, p4/M, z18.h, z29.h\n"
+      "fmin z19.h, p4/M, z19.h, z29.h\n"
+      "fmin z20.h, p4/M, z20.h, z29.h\n"
+      "fmin z21.h, p4/M, z21.h, z29.h\n"
+      "fmin z22.h, p4/M, z22.h, z29.h\n"
+      "fmin z23.h, p4/M, z23.h, z29.h\n"
+      "fmin z24.h, p4/M, z24.h, z29.h\n"
+      "fmin z25.h, p4/M, z25.h, z29.h\n"
+      "fmin z26.h, p4/M, z26.h, z29.h\n"
+      "fmin z27.h, p4/M, z27.h, z29.h\n"
+      "fmax z8.h, p4/M, z8.h, z28.h\n"
+      "fmax z9.h, p4/M, z9.h, z28.h\n"
+      "fmax z10.h, p4/M, z10.h, z28.h\n"
+      "fmax z11.h, p4/M, z11.h, z28.h\n"
+      "fmax z12.h, p4/M, z12.h, z28.h\n"
+      "fmax z13.h, p4/M, z13.h, z28.h\n"
+      "fmax z14.h, p4/M, z14.h, z28.h\n"
+      "fmax z15.h, p4/M, z15.h, z28.h\n"
+      "fmax z16.h, p4/M, z16.h, z28.h\n"
+      "fmax z17.h, p4/M, z17.h, z28.h\n"
+      "fmax z18.h, p4/M, z18.h, z28.h\n"
+      "fmax z19.h, p4/M, z19.h, z28.h\n"
+      "fmax z20.h, p4/M, z20.h, z28.h\n"
+      "fmax z21.h, p4/M, z21.h, z28.h\n"
+      "fmax z22.h, p4/M, z22.h, z28.h\n"
+      "fmax z23.h, p4/M, z23.h, z28.h\n"
+      "fmax z24.h, p4/M, z24.h, z28.h\n"
+      "fmax z25.h, p4/M, z25.h, z28.h\n"
+      "fmax z26.h, p4/M, z26.h, z28.h\n"
+      "fmax z27.h, p4/M, z27.h, z28.h\n"
       "59:"  // Height 5: No activation
       "st1h { z8.h }, p3, [x9]\n"
       "st1h { z9.h }, p2, [x9, #1, MUL VL]\n"
@@ -1081,35 +1081,35 @@
       "63:"  // Height 6: no bias
       "tbz %x[flags], #0, 64f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p3/Z, [x9]\n"
+      "add x24, x9, x20, LSL #1\n"
       "add x23, x24, x20, LSL #1\n"
+      "ld1h { z8.h }, p3/Z, [x9]\n"
       "add x22, x23, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p3/Z, [x25]\n"
-      "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p3/Z, [x24]\n"
-      "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p3/Z, [x23]\n"
-      "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p3/Z, [x22]\n"
-      "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n"
-      "ld1h { z28.h }, p3/Z, [x21]\n"
-      "ld1h { z29.h }, p2/Z, [x21, #1, MUL VL]\n"
-      "ld1h { z30.h }, p1/Z, [x21, #2, MUL VL]\n"
-      "ld1h { z31.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z12.h }, p3/Z, [x24]\n"
+      "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+      "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+      "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z16.h }, p3/Z, [x23]\n"
+      "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z20.h }, p3/Z, [x22]\n"
+      "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z24.h }, p3/Z, [x21]\n"
+      "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z28.h }, p3/Z, [x20]\n"
+      "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
       "b 65f\n"
       "64:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1141,16 +1141,16 @@
       "66:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 67f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 68f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1162,11 +1162,11 @@
       "b 68f\n"
       "67:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "68:"  // Height 6: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rh { z0.h }, p4/Z, [x26]\n"
@@ -1355,7 +1355,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "74:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1363,4 +1362,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
index 6f0b3e0..041825d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -139,11 +139,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -156,164 +156,164 @@
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10]\n"
+      "fmla z8.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z8.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z10.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z11.h, z16.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[2]\n"
+      "fmla z9.h, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[2]\n"
+      "fmla z11.h, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[3]\n"
+      "fmla z9.h, z16.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[3]\n"
+      "fmla z11.h, z16.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[4]\n"
+      "fmla z9.h, z16.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[4]\n"
+      "fmla z11.h, z16.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[5]\n"
+      "fmla z9.h, z16.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[5]\n"
+      "fmla z11.h, z16.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[6]\n"
+      "fmla z9.h, z16.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[6]\n"
+      "fmla z11.h, z16.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[7]\n"
+      "fmla z9.h, z16.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x8\n"
       "cmp x27, #0x8\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z10.h, z17.h, z0.h[7]\n"
+      "fmla z11.h, z16.h, z0.h[7]\n"
       "add x26, x26, #0x10\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10]\n"
+      "fmla z8.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.h, z16.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[0]\n"
+      "fmla z11.h, z16.h, z0.h[0]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[1]\n"
+      "fmla z9.h, z16.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z10.h, z17.h, z0.h[1]\n"
+      "fmla z11.h, z16.h, z0.h[1]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[2]\n"
+      "fmla z9.h, z16.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z10.h, z17.h, z0.h[2]\n"
+      "fmla z11.h, z16.h, z0.h[2]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[3]\n"
+      "fmla z9.h, z16.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z10.h, z17.h, z0.h[3]\n"
+      "fmla z11.h, z16.h, z0.h[3]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[4]\n"
+      "fmla z9.h, z16.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z10.h, z17.h, z0.h[4]\n"
+      "fmla z11.h, z16.h, z0.h[4]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[5]\n"
+      "fmla z9.h, z16.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z10.h, z17.h, z0.h[5]\n"
+      "fmla z11.h, z16.h, z0.h[5]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[6]\n"
+      "fmla z9.h, z16.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z10.h, z17.h, z0.h[6]\n"
+      "fmla z11.h, z16.h, z0.h[6]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[7]\n"
+      "fmla z9.h, z16.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[7]\n"
+      "fmla z11.h, z16.h, z0.h[7]\n"
       "addvl x10, x10, #4\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -322,17 +322,17 @@
       "bne 6b\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z17.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
+      "ld1rh { z16.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z17.h\n"
+      "fmin z9.h, p5/M, z9.h, z17.h\n"
+      "fmin z10.h, p5/M, z10.h, z17.h\n"
+      "fmin z11.h, p5/M, z11.h, z17.h\n"
+      "fmax z8.h, p5/M, z8.h, z16.h\n"
+      "fmax z9.h, p5/M, z9.h, z16.h\n"
+      "fmax z10.h, p5/M, z10.h, z16.h\n"
+      "fmax z11.h, p5/M, z11.h, z16.h\n"
       "12:"  // Height 1: No activation
       "st1h { z8.h }, p4, [x9]\n"
       "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -372,15 +372,15 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
+      "add x20, x9, x20, LSL #1\n"
       "ld1h { z8.h }, p4/Z, [x9]\n"
       "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x20]\n"
+      "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 18f\n"
       "17:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -396,12 +396,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -409,239 +409,239 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
       "21:"  // Height 2: input setup done
       "cmp x27, #0x8\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z1.h }, p0/Z, [x26]\n"
+      "ld1rqh { z0.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[0]\n"
+      "fmla z12.h, z17.h, z0.h[0]\n"
+      "fmla z9.h, z16.h, z1.h[0]\n"
+      "fmla z13.h, z16.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[0]\n"
+      "fmla z14.h, z17.h, z0.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
       "cmp x27, #0x8\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[0]\n"
+      "fmla z15.h, z16.h, z0.h[0]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[1]\n"
+      "fmla z12.h, z17.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[1]\n"
+      "fmla z13.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[1]\n"
+      "fmla z14.h, z17.h, z0.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[1]\n"
+      "fmla z15.h, z16.h, z0.h[1]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[2]\n"
+      "fmla z12.h, z17.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[2]\n"
+      "fmla z13.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[2]\n"
+      "fmla z14.h, z17.h, z0.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[2]\n"
+      "fmla z15.h, z16.h, z0.h[2]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[3]\n"
+      "fmla z12.h, z17.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[3]\n"
+      "fmla z13.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[3]\n"
+      "fmla z14.h, z17.h, z0.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "fmla z11.h, z16.h, z1.h[3]\n"
+      "fmla z15.h, z16.h, z0.h[3]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[4]\n"
+      "fmla z12.h, z17.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[4]\n"
+      "fmla z13.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[4]\n"
+      "fmla z14.h, z17.h, z0.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[4]\n"
+      "fmla z15.h, z16.h, z0.h[4]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[5]\n"
+      "fmla z12.h, z17.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[5]\n"
+      "fmla z13.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z10.h, z17.h, z1.h[5]\n"
+      "fmla z14.h, z17.h, z0.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[5]\n"
+      "fmla z15.h, z16.h, z0.h[5]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[6]\n"
+      "fmla z12.h, z17.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[6]\n"
+      "fmla z13.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[6]\n"
+      "fmla z14.h, z17.h, z0.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.h, z16.h, z1.h[6]\n"
+      "fmla z15.h, z16.h, z0.h[6]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z17.h, z1.h[7]\n"
+      "fmla z12.h, z17.h, z0.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.h, z16.h, z1.h[7]\n"
+      "fmla z13.h, z16.h, z0.h[7]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z17.h, z1.h[7]\n"
+      "fmla z14.h, z17.h, z0.h[7]\n"
+      "fmla z11.h, z16.h, z1.h[7]\n"
+      "fmla z15.h, z16.h, z0.h[7]\n"
       "bgt 22b\n"
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
       "ld1rqh { z0.h }, p0/Z, [x26]\n"
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[0]\n"
+      "fmla z12.h, z17.h, z1.h[0]\n"
+      "fmla z9.h, z16.h, z0.h[0]\n"
+      "fmla z13.h, z16.h, z1.h[0]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[0]\n"
+      "fmla z14.h, z17.h, z1.h[0]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z11.h, z16.h, z0.h[0]\n"
+      "fmla z15.h, z16.h, z1.h[0]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[1]\n"
+      "fmla z12.h, z17.h, z1.h[1]\n"
+      "fmla z9.h, z16.h, z0.h[1]\n"
+      "fmla z13.h, z16.h, z1.h[1]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z10.h, z17.h, z0.h[1]\n"
+      "fmla z14.h, z17.h, z1.h[1]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z11.h, z16.h, z0.h[1]\n"
+      "fmla z15.h, z16.h, z1.h[1]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[2]\n"
+      "fmla z12.h, z17.h, z1.h[2]\n"
+      "fmla z9.h, z16.h, z0.h[2]\n"
+      "fmla z13.h, z16.h, z1.h[2]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z10.h, z17.h, z0.h[2]\n"
+      "fmla z14.h, z17.h, z1.h[2]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z11.h, z16.h, z0.h[2]\n"
+      "fmla z15.h, z16.h, z1.h[2]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[3]\n"
+      "fmla z12.h, z17.h, z1.h[3]\n"
+      "fmla z9.h, z16.h, z0.h[3]\n"
+      "fmla z13.h, z16.h, z1.h[3]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z10.h, z17.h, z0.h[3]\n"
+      "fmla z14.h, z17.h, z1.h[3]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z11.h, z16.h, z0.h[3]\n"
+      "fmla z15.h, z16.h, z1.h[3]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[4]\n"
+      "fmla z12.h, z17.h, z1.h[4]\n"
+      "fmla z9.h, z16.h, z0.h[4]\n"
+      "fmla z13.h, z16.h, z1.h[4]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z10.h, z17.h, z0.h[4]\n"
+      "fmla z14.h, z17.h, z1.h[4]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z11.h, z16.h, z0.h[4]\n"
+      "fmla z15.h, z16.h, z1.h[4]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[5]\n"
+      "fmla z12.h, z17.h, z1.h[5]\n"
+      "fmla z9.h, z16.h, z0.h[5]\n"
+      "fmla z13.h, z16.h, z1.h[5]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z10.h, z17.h, z0.h[5]\n"
+      "fmla z14.h, z17.h, z1.h[5]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z11.h, z16.h, z0.h[5]\n"
+      "fmla z15.h, z16.h, z1.h[5]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[6]\n"
+      "fmla z12.h, z17.h, z1.h[6]\n"
+      "fmla z9.h, z16.h, z0.h[6]\n"
+      "fmla z13.h, z16.h, z1.h[6]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z10.h, z17.h, z0.h[6]\n"
+      "fmla z14.h, z17.h, z1.h[6]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z11.h, z16.h, z0.h[6]\n"
+      "fmla z15.h, z16.h, z1.h[6]\n"
       "ble 24f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z17.h, z0.h[7]\n"
+      "fmla z12.h, z17.h, z1.h[7]\n"
+      "fmla z9.h, z16.h, z0.h[7]\n"
+      "fmla z13.h, z16.h, z1.h[7]\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z17.h, z0.h[7]\n"
+      "fmla z14.h, z17.h, z1.h[7]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z11.h, z16.h, z0.h[7]\n"
+      "fmla z15.h, z16.h, z1.h[7]\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -651,25 +651,25 @@
       "add x25, x9, x20, LSL #1\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z17.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
+      "ld1rh { z16.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z17.h\n"
+      "fmin z9.h, p5/M, z9.h, z17.h\n"
+      "fmin z10.h, p5/M, z10.h, z17.h\n"
+      "fmin z11.h, p5/M, z11.h, z17.h\n"
+      "fmin z12.h, p5/M, z12.h, z17.h\n"
+      "fmin z13.h, p5/M, z13.h, z17.h\n"
+      "fmin z14.h, p5/M, z14.h, z17.h\n"
+      "fmin z15.h, p5/M, z15.h, z17.h\n"
+      "fmax z8.h, p5/M, z8.h, z16.h\n"
+      "fmax z9.h, p5/M, z9.h, z16.h\n"
+      "fmax z10.h, p5/M, z10.h, z16.h\n"
+      "fmax z11.h, p5/M, z11.h, z16.h\n"
+      "fmax z12.h, p5/M, z12.h, z16.h\n"
+      "fmax z13.h, p5/M, z13.h, z16.h\n"
+      "fmax z14.h, p5/M, z14.h, z16.h\n"
+      "fmax z15.h, p5/M, z15.h, z16.h\n"
       "25:"  // Height 2: No activation
       "st1h { z8.h }, p4, [x9]\n"
       "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -717,20 +717,20 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x21, x9, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z8.h }, p4/Z, [x9]\n"
       "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x21]\n"
+      "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x20]\n"
+      "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 31f\n"
       "30:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -750,13 +750,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -765,151 +765,151 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
       "34:"  // Height 3: input setup done
       "cmp x27, #0x8\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1rqh { z0.h }, p0/Z, [x24]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "fmla z8.h, z21.h, z2.h[0]\n"
+      "fmla z12.h, z21.h, z1.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.h, z21.h, z0.h[0]\n"
+      "fmla z9.h, z20.h, z2.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[0]\n"
+      "fmla z17.h, z20.h, z0.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "cmp x27, #0x8\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z10.h, z21.h, z2.h[0]\n"
+      "fmla z14.h, z21.h, z1.h[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z18.h, z21.h, z0.h[0]\n"
+      "fmla z11.h, z20.h, z2.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[0]\n"
+      "fmla z19.h, z20.h, z0.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[1]\n"
+      "fmla z12.h, z21.h, z1.h[1]\n"
+      "fmla z16.h, z21.h, z0.h[1]\n"
+      "fmla z9.h, z20.h, z2.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[1]\n"
+      "fmla z17.h, z20.h, z0.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[1]\n"
+      "fmla z14.h, z21.h, z1.h[1]\n"
+      "fmla z18.h, z21.h, z0.h[1]\n"
+      "fmla z11.h, z20.h, z2.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[1]\n"
+      "fmla z19.h, z20.h, z0.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[2]\n"
+      "fmla z12.h, z21.h, z1.h[2]\n"
+      "fmla z16.h, z21.h, z0.h[2]\n"
+      "fmla z9.h, z20.h, z2.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[2]\n"
+      "fmla z17.h, z20.h, z0.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[2]\n"
+      "fmla z14.h, z21.h, z1.h[2]\n"
+      "fmla z18.h, z21.h, z0.h[2]\n"
+      "fmla z11.h, z20.h, z2.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[2]\n"
+      "fmla z19.h, z20.h, z0.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[3]\n"
+      "fmla z12.h, z21.h, z1.h[3]\n"
+      "fmla z16.h, z21.h, z0.h[3]\n"
+      "fmla z9.h, z20.h, z2.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[3]\n"
+      "fmla z17.h, z20.h, z0.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[3]\n"
+      "fmla z14.h, z21.h, z1.h[3]\n"
+      "fmla z18.h, z21.h, z0.h[3]\n"
+      "fmla z11.h, z20.h, z2.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "fmla z15.h, z20.h, z1.h[3]\n"
+      "fmla z19.h, z20.h, z0.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[4]\n"
+      "fmla z12.h, z21.h, z1.h[4]\n"
+      "fmla z16.h, z21.h, z0.h[4]\n"
+      "fmla z9.h, z20.h, z2.h[4]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[4]\n"
+      "fmla z17.h, z20.h, z0.h[4]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[4]\n"
+      "fmla z14.h, z21.h, z1.h[4]\n"
+      "fmla z18.h, z21.h, z0.h[4]\n"
+      "fmla z11.h, z20.h, z2.h[4]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[4]\n"
+      "fmla z19.h, z20.h, z0.h[4]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[5]\n"
+      "fmla z12.h, z21.h, z1.h[5]\n"
+      "fmla z16.h, z21.h, z0.h[5]\n"
+      "fmla z9.h, z20.h, z2.h[5]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[5]\n"
+      "fmla z17.h, z20.h, z0.h[5]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z10.h, z21.h, z2.h[5]\n"
+      "fmla z14.h, z21.h, z1.h[5]\n"
+      "fmla z18.h, z21.h, z0.h[5]\n"
+      "fmla z11.h, z20.h, z2.h[5]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[5]\n"
+      "fmla z19.h, z20.h, z0.h[5]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[6]\n"
+      "fmla z12.h, z21.h, z1.h[6]\n"
+      "fmla z16.h, z21.h, z0.h[6]\n"
+      "fmla z9.h, z20.h, z2.h[6]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[6]\n"
+      "fmla z17.h, z20.h, z0.h[6]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[6]\n"
+      "fmla z14.h, z21.h, z1.h[6]\n"
+      "fmla z18.h, z21.h, z0.h[6]\n"
+      "fmla z11.h, z20.h, z2.h[6]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z15.h, z20.h, z1.h[6]\n"
+      "fmla z19.h, z20.h, z0.h[6]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z21.h, z2.h[7]\n"
+      "fmla z12.h, z21.h, z1.h[7]\n"
+      "fmla z16.h, z21.h, z0.h[7]\n"
+      "fmla z9.h, z20.h, z2.h[7]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[7]\n"
+      "fmla z17.h, z20.h, z0.h[7]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z21.h, z2.h[7]\n"
+      "fmla z14.h, z21.h, z1.h[7]\n"
+      "fmla z18.h, z21.h, z0.h[7]\n"
+      "fmla z11.h, z20.h, z2.h[7]\n"
+      "fmla z15.h, z20.h, z1.h[7]\n"
+      "fmla z19.h, z20.h, z0.h[7]\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -917,155 +917,155 @@
       "ld1rqh { z1.h }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "fmla z8.h, z21.h, z0.h[0]\n"
+      "fmla z12.h, z21.h, z1.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.h, z21.h, z2.h[0]\n"
+      "fmla z9.h, z20.h, z0.h[0]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[0]\n"
+      "fmla z17.h, z20.h, z2.h[0]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z10.h, z21.h, z0.h[0]\n"
+      "fmla z14.h, z21.h, z1.h[0]\n"
+      "fmla z18.h, z21.h, z2.h[0]\n"
+      "fmla z11.h, z20.h, z0.h[0]\n"
+      "fmla z15.h, z20.h, z1.h[0]\n"
+      "fmla z19.h, z20.h, z2.h[0]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[1]\n"
+      "fmla z12.h, z21.h, z1.h[1]\n"
+      "fmla z16.h, z21.h, z2.h[1]\n"
+      "fmla z9.h, z20.h, z0.h[1]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[1]\n"
+      "fmla z17.h, z20.h, z2.h[1]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z10.h, z21.h, z0.h[1]\n"
+      "fmla z14.h, z21.h, z1.h[1]\n"
+      "fmla z18.h, z21.h, z2.h[1]\n"
+      "fmla z11.h, z20.h, z0.h[1]\n"
+      "fmla z15.h, z20.h, z1.h[1]\n"
+      "fmla z19.h, z20.h, z2.h[1]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[2]\n"
+      "fmla z12.h, z21.h, z1.h[2]\n"
+      "fmla z16.h, z21.h, z2.h[2]\n"
+      "fmla z9.h, z20.h, z0.h[2]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[2]\n"
+      "fmla z17.h, z20.h, z2.h[2]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z10.h, z21.h, z0.h[2]\n"
+      "fmla z14.h, z21.h, z1.h[2]\n"
+      "fmla z18.h, z21.h, z2.h[2]\n"
+      "fmla z11.h, z20.h, z0.h[2]\n"
+      "fmla z15.h, z20.h, z1.h[2]\n"
+      "fmla z19.h, z20.h, z2.h[2]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[3]\n"
+      "fmla z12.h, z21.h, z1.h[3]\n"
+      "fmla z16.h, z21.h, z2.h[3]\n"
+      "fmla z9.h, z20.h, z0.h[3]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[3]\n"
+      "fmla z17.h, z20.h, z2.h[3]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z10.h, z21.h, z0.h[3]\n"
+      "fmla z14.h, z21.h, z1.h[3]\n"
+      "fmla z18.h, z21.h, z2.h[3]\n"
+      "fmla z11.h, z20.h, z0.h[3]\n"
+      "fmla z15.h, z20.h, z1.h[3]\n"
+      "fmla z19.h, z20.h, z2.h[3]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[4]\n"
+      "fmla z12.h, z21.h, z1.h[4]\n"
+      "fmla z16.h, z21.h, z2.h[4]\n"
+      "fmla z9.h, z20.h, z0.h[4]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[4]\n"
+      "fmla z17.h, z20.h, z2.h[4]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z10.h, z21.h, z0.h[4]\n"
+      "fmla z14.h, z21.h, z1.h[4]\n"
+      "fmla z18.h, z21.h, z2.h[4]\n"
+      "fmla z11.h, z20.h, z0.h[4]\n"
+      "fmla z15.h, z20.h, z1.h[4]\n"
+      "fmla z19.h, z20.h, z2.h[4]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[5]\n"
+      "fmla z12.h, z21.h, z1.h[5]\n"
+      "fmla z16.h, z21.h, z2.h[5]\n"
+      "fmla z9.h, z20.h, z0.h[5]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[5]\n"
+      "fmla z17.h, z20.h, z2.h[5]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z10.h, z21.h, z0.h[5]\n"
+      "fmla z14.h, z21.h, z1.h[5]\n"
+      "fmla z18.h, z21.h, z2.h[5]\n"
+      "fmla z11.h, z20.h, z0.h[5]\n"
+      "fmla z15.h, z20.h, z1.h[5]\n"
+      "fmla z19.h, z20.h, z2.h[5]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[6]\n"
+      "fmla z12.h, z21.h, z1.h[6]\n"
+      "fmla z16.h, z21.h, z2.h[6]\n"
+      "fmla z9.h, z20.h, z0.h[6]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[6]\n"
+      "fmla z17.h, z20.h, z2.h[6]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z10.h, z21.h, z0.h[6]\n"
+      "fmla z14.h, z21.h, z1.h[6]\n"
+      "fmla z18.h, z21.h, z2.h[6]\n"
+      "fmla z11.h, z20.h, z0.h[6]\n"
+      "fmla z15.h, z20.h, z1.h[6]\n"
+      "fmla z19.h, z20.h, z2.h[6]\n"
       "ble 37f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z21.h }, p5/Z, [x10]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z21.h, z0.h[7]\n"
+      "fmla z12.h, z21.h, z1.h[7]\n"
+      "fmla z16.h, z21.h, z2.h[7]\n"
+      "fmla z9.h, z20.h, z0.h[7]\n"
+      "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z20.h, z1.h[7]\n"
+      "fmla z17.h, z20.h, z2.h[7]\n"
+      "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z10.h, z21.h, z0.h[7]\n"
+      "fmla z14.h, z21.h, z1.h[7]\n"
+      "fmla z18.h, z21.h, z2.h[7]\n"
+      "fmla z11.h, z20.h, z0.h[7]\n"
+      "fmla z15.h, z20.h, z1.h[7]\n"
+      "fmla z19.h, z20.h, z2.h[7]\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1076,33 +1076,33 @@
       "add x24, x25, x20, LSL #1\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z21.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmin z16.h, p5/M, z16.h, z1.h\n"
-      "fmin z17.h, p5/M, z17.h, z1.h\n"
-      "fmin z18.h, p5/M, z18.h, z1.h\n"
-      "fmin z19.h, p5/M, z19.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
-      "fmax z16.h, p5/M, z16.h, z0.h\n"
-      "fmax z17.h, p5/M, z17.h, z0.h\n"
-      "fmax z18.h, p5/M, z18.h, z0.h\n"
-      "fmax z19.h, p5/M, z19.h, z0.h\n"
+      "ld1rh { z20.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z21.h\n"
+      "fmin z9.h, p5/M, z9.h, z21.h\n"
+      "fmin z10.h, p5/M, z10.h, z21.h\n"
+      "fmin z11.h, p5/M, z11.h, z21.h\n"
+      "fmin z12.h, p5/M, z12.h, z21.h\n"
+      "fmin z13.h, p5/M, z13.h, z21.h\n"
+      "fmin z14.h, p5/M, z14.h, z21.h\n"
+      "fmin z15.h, p5/M, z15.h, z21.h\n"
+      "fmin z16.h, p5/M, z16.h, z21.h\n"
+      "fmin z17.h, p5/M, z17.h, z21.h\n"
+      "fmin z18.h, p5/M, z18.h, z21.h\n"
+      "fmin z19.h, p5/M, z19.h, z21.h\n"
+      "fmax z8.h, p5/M, z8.h, z20.h\n"
+      "fmax z9.h, p5/M, z9.h, z20.h\n"
+      "fmax z10.h, p5/M, z10.h, z20.h\n"
+      "fmax z11.h, p5/M, z11.h, z20.h\n"
+      "fmax z12.h, p5/M, z12.h, z20.h\n"
+      "fmax z13.h, p5/M, z13.h, z20.h\n"
+      "fmax z14.h, p5/M, z14.h, z20.h\n"
+      "fmax z15.h, p5/M, z15.h, z20.h\n"
+      "fmax z16.h, p5/M, z16.h, z20.h\n"
+      "fmax z17.h, p5/M, z17.h, z20.h\n"
+      "fmax z18.h, p5/M, z18.h, z20.h\n"
+      "fmax z19.h, p5/M, z19.h, z20.h\n"
       "38:"  // Height 3: No activation
       "st1h { z8.h }, p4, [x9]\n"
       "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -1158,25 +1158,25 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
+      "add x22, x9, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z8.h }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p4/Z, [x23]\n"
-      "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x22]\n"
+      "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x21]\n"
+      "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x20]\n"
+      "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 44f\n"
       "43:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -1200,14 +1200,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1217,186 +1217,186 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
       "47:"  // Height 4: input setup done
       "cmp x27, #0x8\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z3.h }, p0/Z, [x26]\n"
+      "ld1rqh { z2.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z1.h }, p0/Z, [x24]\n"
+      "ld1rqh { z0.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[0]\n"
+      "fmla z12.h, z25.h, z2.h[0]\n"
+      "fmla z16.h, z25.h, z1.h[0]\n"
+      "fmla z20.h, z25.h, z0.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
+      "fmla z9.h, z24.h, z3.h[0]\n"
+      "fmla z13.h, z24.h, z2.h[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z17.h, z24.h, z1.h[0]\n"
+      "fmla z21.h, z24.h, z0.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[0]\n"
+      "fmla z14.h, z25.h, z2.h[0]\n"
+      "fmla z18.h, z25.h, z1.h[0]\n"
+      "fmla z22.h, z25.h, z0.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[0]\n"
+      "fmla z15.h, z24.h, z2.h[0]\n"
+      "fmla z19.h, z24.h, z1.h[0]\n"
+      "fmla z23.h, z24.h, z0.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[1]\n"
+      "fmla z12.h, z25.h, z2.h[1]\n"
+      "fmla z16.h, z25.h, z1.h[1]\n"
+      "fmla z20.h, z25.h, z0.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[1]\n"
+      "fmla z13.h, z24.h, z2.h[1]\n"
+      "fmla z17.h, z24.h, z1.h[1]\n"
+      "fmla z21.h, z24.h, z0.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[1]\n"
+      "fmla z14.h, z25.h, z2.h[1]\n"
+      "fmla z18.h, z25.h, z1.h[1]\n"
+      "fmla z22.h, z25.h, z0.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[1]\n"
+      "fmla z15.h, z24.h, z2.h[1]\n"
+      "fmla z19.h, z24.h, z1.h[1]\n"
+      "fmla z23.h, z24.h, z0.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[2]\n"
+      "fmla z12.h, z25.h, z2.h[2]\n"
+      "fmla z16.h, z25.h, z1.h[2]\n"
+      "fmla z20.h, z25.h, z0.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[2]\n"
+      "fmla z13.h, z24.h, z2.h[2]\n"
+      "fmla z17.h, z24.h, z1.h[2]\n"
+      "fmla z21.h, z24.h, z0.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[2]\n"
+      "fmla z14.h, z25.h, z2.h[2]\n"
+      "fmla z18.h, z25.h, z1.h[2]\n"
+      "fmla z22.h, z25.h, z0.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[2]\n"
+      "fmla z15.h, z24.h, z2.h[2]\n"
+      "fmla z19.h, z24.h, z1.h[2]\n"
+      "fmla z23.h, z24.h, z0.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[3]\n"
+      "fmla z12.h, z25.h, z2.h[3]\n"
+      "fmla z16.h, z25.h, z1.h[3]\n"
+      "fmla z20.h, z25.h, z0.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[3]\n"
+      "fmla z13.h, z24.h, z2.h[3]\n"
+      "fmla z17.h, z24.h, z1.h[3]\n"
+      "fmla z21.h, z24.h, z0.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[3]\n"
+      "fmla z14.h, z25.h, z2.h[3]\n"
+      "fmla z18.h, z25.h, z1.h[3]\n"
+      "fmla z22.h, z25.h, z0.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "fmla z11.h, z24.h, z3.h[3]\n"
+      "fmla z15.h, z24.h, z2.h[3]\n"
+      "fmla z19.h, z24.h, z1.h[3]\n"
+      "fmla z23.h, z24.h, z0.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[4]\n"
+      "fmla z12.h, z25.h, z2.h[4]\n"
+      "fmla z16.h, z25.h, z1.h[4]\n"
+      "fmla z20.h, z25.h, z0.h[4]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[4]\n"
+      "fmla z13.h, z24.h, z2.h[4]\n"
+      "fmla z17.h, z24.h, z1.h[4]\n"
+      "fmla z21.h, z24.h, z0.h[4]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[4]\n"
+      "fmla z14.h, z25.h, z2.h[4]\n"
+      "fmla z18.h, z25.h, z1.h[4]\n"
+      "fmla z22.h, z25.h, z0.h[4]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[4]\n"
+      "fmla z15.h, z24.h, z2.h[4]\n"
+      "fmla z19.h, z24.h, z1.h[4]\n"
+      "fmla z23.h, z24.h, z0.h[4]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[5]\n"
+      "fmla z12.h, z25.h, z2.h[5]\n"
+      "fmla z16.h, z25.h, z1.h[5]\n"
+      "fmla z20.h, z25.h, z0.h[5]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[5]\n"
+      "fmla z13.h, z24.h, z2.h[5]\n"
+      "fmla z17.h, z24.h, z1.h[5]\n"
+      "fmla z21.h, z24.h, z0.h[5]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z10.h, z25.h, z3.h[5]\n"
+      "fmla z14.h, z25.h, z2.h[5]\n"
+      "fmla z18.h, z25.h, z1.h[5]\n"
+      "fmla z22.h, z25.h, z0.h[5]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[5]\n"
+      "fmla z15.h, z24.h, z2.h[5]\n"
+      "fmla z19.h, z24.h, z1.h[5]\n"
+      "fmla z23.h, z24.h, z0.h[5]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[6]\n"
+      "fmla z12.h, z25.h, z2.h[6]\n"
+      "fmla z16.h, z25.h, z1.h[6]\n"
+      "fmla z20.h, z25.h, z0.h[6]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[6]\n"
+      "fmla z13.h, z24.h, z2.h[6]\n"
+      "fmla z17.h, z24.h, z1.h[6]\n"
+      "fmla z21.h, z24.h, z0.h[6]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[6]\n"
+      "fmla z14.h, z25.h, z2.h[6]\n"
+      "fmla z18.h, z25.h, z1.h[6]\n"
+      "fmla z22.h, z25.h, z0.h[6]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.h, z24.h, z3.h[6]\n"
+      "fmla z15.h, z24.h, z2.h[6]\n"
+      "fmla z19.h, z24.h, z1.h[6]\n"
+      "fmla z23.h, z24.h, z0.h[6]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z25.h, z3.h[7]\n"
+      "fmla z12.h, z25.h, z2.h[7]\n"
+      "fmla z16.h, z25.h, z1.h[7]\n"
+      "fmla z20.h, z25.h, z0.h[7]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.h, z24.h, z3.h[7]\n"
+      "fmla z13.h, z24.h, z2.h[7]\n"
+      "fmla z17.h, z24.h, z1.h[7]\n"
+      "fmla z21.h, z24.h, z0.h[7]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z25.h, z3.h[7]\n"
+      "fmla z14.h, z25.h, z2.h[7]\n"
+      "fmla z18.h, z25.h, z1.h[7]\n"
+      "fmla z22.h, z25.h, z0.h[7]\n"
+      "fmla z11.h, z24.h, z3.h[7]\n"
+      "fmla z15.h, z24.h, z2.h[7]\n"
+      "fmla z19.h, z24.h, z1.h[7]\n"
+      "fmla z23.h, z24.h, z0.h[7]\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -1405,187 +1405,187 @@
       "subs x27, x27, #0x1\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[0]\n"
+      "fmla z12.h, z25.h, z1.h[0]\n"
+      "fmla z16.h, z25.h, z2.h[0]\n"
+      "fmla z20.h, z25.h, z3.h[0]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[0]\n"
+      "fmla z13.h, z24.h, z1.h[0]\n"
+      "fmla z17.h, z24.h, z2.h[0]\n"
+      "fmla z21.h, z24.h, z3.h[0]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
+      "fmla z10.h, z25.h, z0.h[0]\n"
+      "fmla z14.h, z25.h, z1.h[0]\n"
+      "fmla z18.h, z25.h, z2.h[0]\n"
+      "fmla z22.h, z25.h, z3.h[0]\n"
+      "fmla z11.h, z24.h, z0.h[0]\n"
+      "fmla z15.h, z24.h, z1.h[0]\n"
+      "fmla z19.h, z24.h, z2.h[0]\n"
+      "fmla z23.h, z24.h, z3.h[0]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[1]\n"
+      "fmla z12.h, z25.h, z1.h[1]\n"
+      "fmla z16.h, z25.h, z2.h[1]\n"
+      "fmla z20.h, z25.h, z3.h[1]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[1]\n"
+      "fmla z13.h, z24.h, z1.h[1]\n"
+      "fmla z17.h, z24.h, z2.h[1]\n"
+      "fmla z21.h, z24.h, z3.h[1]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
+      "fmla z10.h, z25.h, z0.h[1]\n"
+      "fmla z14.h, z25.h, z1.h[1]\n"
+      "fmla z18.h, z25.h, z2.h[1]\n"
+      "fmla z22.h, z25.h, z3.h[1]\n"
+      "fmla z11.h, z24.h, z0.h[1]\n"
+      "fmla z15.h, z24.h, z1.h[1]\n"
+      "fmla z19.h, z24.h, z2.h[1]\n"
+      "fmla z23.h, z24.h, z3.h[1]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[2]\n"
+      "fmla z12.h, z25.h, z1.h[2]\n"
+      "fmla z16.h, z25.h, z2.h[2]\n"
+      "fmla z20.h, z25.h, z3.h[2]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[2]\n"
+      "fmla z13.h, z24.h, z1.h[2]\n"
+      "fmla z17.h, z24.h, z2.h[2]\n"
+      "fmla z21.h, z24.h, z3.h[2]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
+      "fmla z10.h, z25.h, z0.h[2]\n"
+      "fmla z14.h, z25.h, z1.h[2]\n"
+      "fmla z18.h, z25.h, z2.h[2]\n"
+      "fmla z22.h, z25.h, z3.h[2]\n"
+      "fmla z11.h, z24.h, z0.h[2]\n"
+      "fmla z15.h, z24.h, z1.h[2]\n"
+      "fmla z19.h, z24.h, z2.h[2]\n"
+      "fmla z23.h, z24.h, z3.h[2]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[3]\n"
+      "fmla z12.h, z25.h, z1.h[3]\n"
+      "fmla z16.h, z25.h, z2.h[3]\n"
+      "fmla z20.h, z25.h, z3.h[3]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[3]\n"
+      "fmla z13.h, z24.h, z1.h[3]\n"
+      "fmla z17.h, z24.h, z2.h[3]\n"
+      "fmla z21.h, z24.h, z3.h[3]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
+      "fmla z10.h, z25.h, z0.h[3]\n"
+      "fmla z14.h, z25.h, z1.h[3]\n"
+      "fmla z18.h, z25.h, z2.h[3]\n"
+      "fmla z22.h, z25.h, z3.h[3]\n"
+      "fmla z11.h, z24.h, z0.h[3]\n"
+      "fmla z15.h, z24.h, z1.h[3]\n"
+      "fmla z19.h, z24.h, z2.h[3]\n"
+      "fmla z23.h, z24.h, z3.h[3]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[4]\n"
+      "fmla z12.h, z25.h, z1.h[4]\n"
+      "fmla z16.h, z25.h, z2.h[4]\n"
+      "fmla z20.h, z25.h, z3.h[4]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[4]\n"
+      "fmla z13.h, z24.h, z1.h[4]\n"
+      "fmla z17.h, z24.h, z2.h[4]\n"
+      "fmla z21.h, z24.h, z3.h[4]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
+      "fmla z10.h, z25.h, z0.h[4]\n"
+      "fmla z14.h, z25.h, z1.h[4]\n"
+      "fmla z18.h, z25.h, z2.h[4]\n"
+      "fmla z22.h, z25.h, z3.h[4]\n"
+      "fmla z11.h, z24.h, z0.h[4]\n"
+      "fmla z15.h, z24.h, z1.h[4]\n"
+      "fmla z19.h, z24.h, z2.h[4]\n"
+      "fmla z23.h, z24.h, z3.h[4]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[5]\n"
+      "fmla z12.h, z25.h, z1.h[5]\n"
+      "fmla z16.h, z25.h, z2.h[5]\n"
+      "fmla z20.h, z25.h, z3.h[5]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[5]\n"
+      "fmla z13.h, z24.h, z1.h[5]\n"
+      "fmla z17.h, z24.h, z2.h[5]\n"
+      "fmla z21.h, z24.h, z3.h[5]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
+      "fmla z10.h, z25.h, z0.h[5]\n"
+      "fmla z14.h, z25.h, z1.h[5]\n"
+      "fmla z18.h, z25.h, z2.h[5]\n"
+      "fmla z22.h, z25.h, z3.h[5]\n"
+      "fmla z11.h, z24.h, z0.h[5]\n"
+      "fmla z15.h, z24.h, z1.h[5]\n"
+      "fmla z19.h, z24.h, z2.h[5]\n"
+      "fmla z23.h, z24.h, z3.h[5]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[6]\n"
+      "fmla z12.h, z25.h, z1.h[6]\n"
+      "fmla z16.h, z25.h, z2.h[6]\n"
+      "fmla z20.h, z25.h, z3.h[6]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[6]\n"
+      "fmla z13.h, z24.h, z1.h[6]\n"
+      "fmla z17.h, z24.h, z2.h[6]\n"
+      "fmla z21.h, z24.h, z3.h[6]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
+      "fmla z10.h, z25.h, z0.h[6]\n"
+      "fmla z14.h, z25.h, z1.h[6]\n"
+      "fmla z18.h, z25.h, z2.h[6]\n"
+      "fmla z22.h, z25.h, z3.h[6]\n"
+      "fmla z11.h, z24.h, z0.h[6]\n"
+      "fmla z15.h, z24.h, z1.h[6]\n"
+      "fmla z19.h, z24.h, z2.h[6]\n"
+      "fmla z23.h, z24.h, z3.h[6]\n"
       "ble 50f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z25.h, z0.h[7]\n"
+      "fmla z12.h, z25.h, z1.h[7]\n"
+      "fmla z16.h, z25.h, z2.h[7]\n"
+      "fmla z20.h, z25.h, z3.h[7]\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z24.h, z0.h[7]\n"
+      "fmla z13.h, z24.h, z1.h[7]\n"
+      "fmla z17.h, z24.h, z2.h[7]\n"
+      "fmla z21.h, z24.h, z3.h[7]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z10.h, z25.h, z0.h[7]\n"
+      "fmla z14.h, z25.h, z1.h[7]\n"
+      "fmla z18.h, z25.h, z2.h[7]\n"
+      "fmla z22.h, z25.h, z3.h[7]\n"
+      "fmla z11.h, z24.h, z0.h[7]\n"
+      "fmla z15.h, z24.h, z1.h[7]\n"
+      "fmla z19.h, z24.h, z2.h[7]\n"
+      "fmla z23.h, z24.h, z3.h[7]\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1597,41 +1597,41 @@
       "add x23, x24, x20, LSL #1\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z25.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmin z16.h, p5/M, z16.h, z1.h\n"
-      "fmin z17.h, p5/M, z17.h, z1.h\n"
-      "fmin z18.h, p5/M, z18.h, z1.h\n"
-      "fmin z19.h, p5/M, z19.h, z1.h\n"
-      "fmin z20.h, p5/M, z20.h, z1.h\n"
-      "fmin z21.h, p5/M, z21.h, z1.h\n"
-      "fmin z22.h, p5/M, z22.h, z1.h\n"
-      "fmin z23.h, p5/M, z23.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
-      "fmax z16.h, p5/M, z16.h, z0.h\n"
-      "fmax z17.h, p5/M, z17.h, z0.h\n"
-      "fmax z18.h, p5/M, z18.h, z0.h\n"
-      "fmax z19.h, p5/M, z19.h, z0.h\n"
-      "fmax z20.h, p5/M, z20.h, z0.h\n"
-      "fmax z21.h, p5/M, z21.h, z0.h\n"
-      "fmax z22.h, p5/M, z22.h, z0.h\n"
-      "fmax z23.h, p5/M, z23.h, z0.h\n"
+      "ld1rh { z24.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z25.h\n"
+      "fmin z9.h, p5/M, z9.h, z25.h\n"
+      "fmin z10.h, p5/M, z10.h, z25.h\n"
+      "fmin z11.h, p5/M, z11.h, z25.h\n"
+      "fmin z12.h, p5/M, z12.h, z25.h\n"
+      "fmin z13.h, p5/M, z13.h, z25.h\n"
+      "fmin z14.h, p5/M, z14.h, z25.h\n"
+      "fmin z15.h, p5/M, z15.h, z25.h\n"
+      "fmin z16.h, p5/M, z16.h, z25.h\n"
+      "fmin z17.h, p5/M, z17.h, z25.h\n"
+      "fmin z18.h, p5/M, z18.h, z25.h\n"
+      "fmin z19.h, p5/M, z19.h, z25.h\n"
+      "fmin z20.h, p5/M, z20.h, z25.h\n"
+      "fmin z21.h, p5/M, z21.h, z25.h\n"
+      "fmin z22.h, p5/M, z22.h, z25.h\n"
+      "fmin z23.h, p5/M, z23.h, z25.h\n"
+      "fmax z8.h, p5/M, z8.h, z24.h\n"
+      "fmax z9.h, p5/M, z9.h, z24.h\n"
+      "fmax z10.h, p5/M, z10.h, z24.h\n"
+      "fmax z11.h, p5/M, z11.h, z24.h\n"
+      "fmax z12.h, p5/M, z12.h, z24.h\n"
+      "fmax z13.h, p5/M, z13.h, z24.h\n"
+      "fmax z14.h, p5/M, z14.h, z24.h\n"
+      "fmax z15.h, p5/M, z15.h, z24.h\n"
+      "fmax z16.h, p5/M, z16.h, z24.h\n"
+      "fmax z17.h, p5/M, z17.h, z24.h\n"
+      "fmax z18.h, p5/M, z18.h, z24.h\n"
+      "fmax z19.h, p5/M, z19.h, z24.h\n"
+      "fmax z20.h, p5/M, z20.h, z24.h\n"
+      "fmax z21.h, p5/M, z21.h, z24.h\n"
+      "fmax z22.h, p5/M, z22.h, z24.h\n"
+      "fmax z23.h, p5/M, z23.h, z24.h\n"
       "51:"  // Height 4: No activation
       "st1h { z8.h }, p4, [x9]\n"
       "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -1695,30 +1695,30 @@
       "55:"  // Height 5: no bias
       "tbz %x[flags], #0, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #1\n"
+      "add x23, x9, x20, LSL #1\n"
       "add x22, x23, x20, LSL #1\n"
+      "ld1h { z8.h }, p4/Z, [x9]\n"
+      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p4/Z, [x23]\n"
-      "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p4/Z, [x22]\n"
-      "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x23]\n"
+      "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x22]\n"
+      "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x21]\n"
+      "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x20]\n"
+      "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 57f\n"
       "56:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1746,15 +1746,15 @@
       "58:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -1765,221 +1765,221 @@
       "b 60f\n"
       "59:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
       "60:"  // Height 5: input setup done
       "cmp x27, #0x8\n"
       "ble 62f\n"
       "61:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z4.h }, p0/Z, [x26]\n"
+      "ld1rqh { z3.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z1.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1rqh { z0.h }, p0/Z, [x22]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "fmla z8.h, z29.h, z4.h[0]\n"
+      "fmla z12.h, z29.h, z3.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.h, z29.h, z2.h[0]\n"
+      "fmla z20.h, z29.h, z1.h[0]\n"
       "add x25, x25, #0x10\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z24.h, z29.h, z0.h[0]\n"
+      "fmla z9.h, z28.h, z4.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
+      "fmla z13.h, z28.h, z3.h[0]\n"
+      "fmla z17.h, z28.h, z2.h[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z21.h, z28.h, z1.h[0]\n"
+      "fmla z25.h, z28.h, z0.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[0]\n"
+      "fmla z14.h, z29.h, z3.h[0]\n"
+      "fmla z18.h, z29.h, z2.h[0]\n"
+      "fmla z22.h, z29.h, z1.h[0]\n"
+      "fmla z26.h, z29.h, z0.h[0]\n"
+      "fmla z11.h, z28.h, z4.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[0]\n"
+      "fmla z19.h, z28.h, z2.h[0]\n"
+      "fmla z23.h, z28.h, z1.h[0]\n"
+      "fmla z27.h, z28.h, z0.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[1]\n"
+      "fmla z12.h, z29.h, z3.h[1]\n"
+      "fmla z16.h, z29.h, z2.h[1]\n"
+      "fmla z20.h, z29.h, z1.h[1]\n"
+      "fmla z24.h, z29.h, z0.h[1]\n"
+      "fmla z9.h, z28.h, z4.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[1]\n"
+      "fmla z17.h, z28.h, z2.h[1]\n"
+      "fmla z21.h, z28.h, z1.h[1]\n"
+      "fmla z25.h, z28.h, z0.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[1]\n"
+      "fmla z14.h, z29.h, z3.h[1]\n"
+      "fmla z18.h, z29.h, z2.h[1]\n"
+      "fmla z22.h, z29.h, z1.h[1]\n"
+      "fmla z26.h, z29.h, z0.h[1]\n"
+      "fmla z11.h, z28.h, z4.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[1]\n"
+      "fmla z19.h, z28.h, z2.h[1]\n"
+      "fmla z23.h, z28.h, z1.h[1]\n"
+      "fmla z27.h, z28.h, z0.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[2]\n"
+      "fmla z12.h, z29.h, z3.h[2]\n"
+      "fmla z16.h, z29.h, z2.h[2]\n"
+      "fmla z20.h, z29.h, z1.h[2]\n"
+      "fmla z24.h, z29.h, z0.h[2]\n"
+      "fmla z9.h, z28.h, z4.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[2]\n"
+      "fmla z17.h, z28.h, z2.h[2]\n"
+      "fmla z21.h, z28.h, z1.h[2]\n"
+      "fmla z25.h, z28.h, z0.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[2]\n"
+      "fmla z14.h, z29.h, z3.h[2]\n"
+      "fmla z18.h, z29.h, z2.h[2]\n"
+      "fmla z22.h, z29.h, z1.h[2]\n"
+      "fmla z26.h, z29.h, z0.h[2]\n"
+      "fmla z11.h, z28.h, z4.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[2]\n"
+      "fmla z19.h, z28.h, z2.h[2]\n"
+      "fmla z23.h, z28.h, z1.h[2]\n"
+      "fmla z27.h, z28.h, z0.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[3]\n"
+      "fmla z12.h, z29.h, z3.h[3]\n"
+      "fmla z16.h, z29.h, z2.h[3]\n"
+      "fmla z20.h, z29.h, z1.h[3]\n"
+      "fmla z24.h, z29.h, z0.h[3]\n"
+      "fmla z9.h, z28.h, z4.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[3]\n"
+      "fmla z17.h, z28.h, z2.h[3]\n"
+      "fmla z21.h, z28.h, z1.h[3]\n"
+      "fmla z25.h, z28.h, z0.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[3]\n"
+      "fmla z14.h, z29.h, z3.h[3]\n"
+      "fmla z18.h, z29.h, z2.h[3]\n"
+      "fmla z22.h, z29.h, z1.h[3]\n"
+      "fmla z26.h, z29.h, z0.h[3]\n"
+      "fmla z11.h, z28.h, z4.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "fmla z15.h, z28.h, z3.h[3]\n"
+      "fmla z19.h, z28.h, z2.h[3]\n"
+      "fmla z23.h, z28.h, z1.h[3]\n"
+      "fmla z27.h, z28.h, z0.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[4]\n"
+      "fmla z12.h, z29.h, z3.h[4]\n"
+      "fmla z16.h, z29.h, z2.h[4]\n"
+      "fmla z20.h, z29.h, z1.h[4]\n"
+      "fmla z24.h, z29.h, z0.h[4]\n"
+      "fmla z9.h, z28.h, z4.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[4]\n"
+      "fmla z17.h, z28.h, z2.h[4]\n"
+      "fmla z21.h, z28.h, z1.h[4]\n"
+      "fmla z25.h, z28.h, z0.h[4]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[4]\n"
+      "fmla z14.h, z29.h, z3.h[4]\n"
+      "fmla z18.h, z29.h, z2.h[4]\n"
+      "fmla z22.h, z29.h, z1.h[4]\n"
+      "fmla z26.h, z29.h, z0.h[4]\n"
+      "fmla z11.h, z28.h, z4.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[4]\n"
+      "fmla z19.h, z28.h, z2.h[4]\n"
+      "fmla z23.h, z28.h, z1.h[4]\n"
+      "fmla z27.h, z28.h, z0.h[4]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[5]\n"
+      "fmla z12.h, z29.h, z3.h[5]\n"
+      "fmla z16.h, z29.h, z2.h[5]\n"
+      "fmla z20.h, z29.h, z1.h[5]\n"
+      "fmla z24.h, z29.h, z0.h[5]\n"
+      "fmla z9.h, z28.h, z4.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[5]\n"
+      "fmla z17.h, z28.h, z2.h[5]\n"
+      "fmla z21.h, z28.h, z1.h[5]\n"
+      "fmla z25.h, z28.h, z0.h[5]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
+      "fmla z10.h, z29.h, z4.h[5]\n"
+      "fmla z14.h, z29.h, z3.h[5]\n"
+      "fmla z18.h, z29.h, z2.h[5]\n"
+      "fmla z22.h, z29.h, z1.h[5]\n"
+      "fmla z26.h, z29.h, z0.h[5]\n"
+      "fmla z11.h, z28.h, z4.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[5]\n"
+      "fmla z19.h, z28.h, z2.h[5]\n"
+      "fmla z23.h, z28.h, z1.h[5]\n"
+      "fmla z27.h, z28.h, z0.h[5]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[6]\n"
+      "fmla z12.h, z29.h, z3.h[6]\n"
+      "fmla z16.h, z29.h, z2.h[6]\n"
+      "fmla z20.h, z29.h, z1.h[6]\n"
+      "fmla z24.h, z29.h, z0.h[6]\n"
+      "fmla z9.h, z28.h, z4.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[6]\n"
+      "fmla z17.h, z28.h, z2.h[6]\n"
+      "fmla z21.h, z28.h, z1.h[6]\n"
+      "fmla z25.h, z28.h, z0.h[6]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[6]\n"
+      "fmla z14.h, z29.h, z3.h[6]\n"
+      "fmla z18.h, z29.h, z2.h[6]\n"
+      "fmla z22.h, z29.h, z1.h[6]\n"
+      "fmla z26.h, z29.h, z0.h[6]\n"
+      "fmla z11.h, z28.h, z4.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z15.h, z28.h, z3.h[6]\n"
+      "fmla z19.h, z28.h, z2.h[6]\n"
+      "fmla z23.h, z28.h, z1.h[6]\n"
+      "fmla z27.h, z28.h, z0.h[6]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z29.h, z4.h[7]\n"
+      "fmla z12.h, z29.h, z3.h[7]\n"
+      "fmla z16.h, z29.h, z2.h[7]\n"
+      "fmla z20.h, z29.h, z1.h[7]\n"
+      "fmla z24.h, z29.h, z0.h[7]\n"
+      "fmla z9.h, z28.h, z4.h[7]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z13.h, z28.h, z3.h[7]\n"
+      "fmla z17.h, z28.h, z2.h[7]\n"
+      "fmla z21.h, z28.h, z1.h[7]\n"
+      "fmla z25.h, z28.h, z0.h[7]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z29.h, z4.h[7]\n"
+      "fmla z14.h, z29.h, z3.h[7]\n"
+      "fmla z18.h, z29.h, z2.h[7]\n"
+      "fmla z22.h, z29.h, z1.h[7]\n"
+      "fmla z26.h, z29.h, z0.h[7]\n"
+      "fmla z11.h, z28.h, z4.h[7]\n"
+      "fmla z15.h, z28.h, z3.h[7]\n"
+      "fmla z19.h, z28.h, z2.h[7]\n"
+      "fmla z23.h, z28.h, z1.h[7]\n"
+      "fmla z27.h, z28.h, z0.h[7]\n"
       "bgt 61b\n"
       "62:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -1989,219 +1989,219 @@
       "ld1rqh { z2.h }, p0/Z, [x24]\n"
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
       "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "fmla z8.h, z29.h, z0.h[0]\n"
+      "fmla z12.h, z29.h, z1.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.h, z29.h, z2.h[0]\n"
+      "fmla z20.h, z29.h, z3.h[0]\n"
+      "fmla z24.h, z29.h, z4.h[0]\n"
+      "fmla z9.h, z28.h, z0.h[0]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[0]\n"
+      "fmla z17.h, z28.h, z2.h[0]\n"
+      "fmla z21.h, z28.h, z3.h[0]\n"
+      "fmla z25.h, z28.h, z4.h[0]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
+      "fmla z10.h, z29.h, z0.h[0]\n"
+      "fmla z14.h, z29.h, z1.h[0]\n"
+      "fmla z18.h, z29.h, z2.h[0]\n"
+      "fmla z22.h, z29.h, z3.h[0]\n"
+      "fmla z26.h, z29.h, z4.h[0]\n"
+      "fmla z11.h, z28.h, z0.h[0]\n"
+      "fmla z15.h, z28.h, z1.h[0]\n"
+      "fmla z19.h, z28.h, z2.h[0]\n"
+      "fmla z23.h, z28.h, z3.h[0]\n"
+      "fmla z27.h, z28.h, z4.h[0]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[1]\n"
+      "fmla z12.h, z29.h, z1.h[1]\n"
+      "fmla z16.h, z29.h, z2.h[1]\n"
+      "fmla z20.h, z29.h, z3.h[1]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z29.h, z4.h[1]\n"
+      "fmla z9.h, z28.h, z0.h[1]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[1]\n"
+      "fmla z17.h, z28.h, z2.h[1]\n"
+      "fmla z21.h, z28.h, z3.h[1]\n"
+      "fmla z25.h, z28.h, z4.h[1]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
+      "fmla z10.h, z29.h, z0.h[1]\n"
+      "fmla z14.h, z29.h, z1.h[1]\n"
+      "fmla z18.h, z29.h, z2.h[1]\n"
+      "fmla z22.h, z29.h, z3.h[1]\n"
+      "fmla z26.h, z29.h, z4.h[1]\n"
+      "fmla z11.h, z28.h, z0.h[1]\n"
+      "fmla z15.h, z28.h, z1.h[1]\n"
+      "fmla z19.h, z28.h, z2.h[1]\n"
+      "fmla z23.h, z28.h, z3.h[1]\n"
+      "fmla z27.h, z28.h, z4.h[1]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[2]\n"
+      "fmla z12.h, z29.h, z1.h[2]\n"
+      "fmla z16.h, z29.h, z2.h[2]\n"
+      "fmla z20.h, z29.h, z3.h[2]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z29.h, z4.h[2]\n"
+      "fmla z9.h, z28.h, z0.h[2]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[2]\n"
+      "fmla z17.h, z28.h, z2.h[2]\n"
+      "fmla z21.h, z28.h, z3.h[2]\n"
+      "fmla z25.h, z28.h, z4.h[2]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
+      "fmla z10.h, z29.h, z0.h[2]\n"
+      "fmla z14.h, z29.h, z1.h[2]\n"
+      "fmla z18.h, z29.h, z2.h[2]\n"
+      "fmla z22.h, z29.h, z3.h[2]\n"
+      "fmla z26.h, z29.h, z4.h[2]\n"
+      "fmla z11.h, z28.h, z0.h[2]\n"
+      "fmla z15.h, z28.h, z1.h[2]\n"
+      "fmla z19.h, z28.h, z2.h[2]\n"
+      "fmla z23.h, z28.h, z3.h[2]\n"
+      "fmla z27.h, z28.h, z4.h[2]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[3]\n"
+      "fmla z12.h, z29.h, z1.h[3]\n"
+      "fmla z16.h, z29.h, z2.h[3]\n"
+      "fmla z20.h, z29.h, z3.h[3]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z29.h, z4.h[3]\n"
+      "fmla z9.h, z28.h, z0.h[3]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[3]\n"
+      "fmla z17.h, z28.h, z2.h[3]\n"
+      "fmla z21.h, z28.h, z3.h[3]\n"
+      "fmla z25.h, z28.h, z4.h[3]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
+      "fmla z10.h, z29.h, z0.h[3]\n"
+      "fmla z14.h, z29.h, z1.h[3]\n"
+      "fmla z18.h, z29.h, z2.h[3]\n"
+      "fmla z22.h, z29.h, z3.h[3]\n"
+      "fmla z26.h, z29.h, z4.h[3]\n"
+      "fmla z11.h, z28.h, z0.h[3]\n"
+      "fmla z15.h, z28.h, z1.h[3]\n"
+      "fmla z19.h, z28.h, z2.h[3]\n"
+      "fmla z23.h, z28.h, z3.h[3]\n"
+      "fmla z27.h, z28.h, z4.h[3]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[4]\n"
+      "fmla z12.h, z29.h, z1.h[4]\n"
+      "fmla z16.h, z29.h, z2.h[4]\n"
+      "fmla z20.h, z29.h, z3.h[4]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z29.h, z4.h[4]\n"
+      "fmla z9.h, z28.h, z0.h[4]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[4]\n"
+      "fmla z17.h, z28.h, z2.h[4]\n"
+      "fmla z21.h, z28.h, z3.h[4]\n"
+      "fmla z25.h, z28.h, z4.h[4]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
+      "fmla z10.h, z29.h, z0.h[4]\n"
+      "fmla z14.h, z29.h, z1.h[4]\n"
+      "fmla z18.h, z29.h, z2.h[4]\n"
+      "fmla z22.h, z29.h, z3.h[4]\n"
+      "fmla z26.h, z29.h, z4.h[4]\n"
+      "fmla z11.h, z28.h, z0.h[4]\n"
+      "fmla z15.h, z28.h, z1.h[4]\n"
+      "fmla z19.h, z28.h, z2.h[4]\n"
+      "fmla z23.h, z28.h, z3.h[4]\n"
+      "fmla z27.h, z28.h, z4.h[4]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[5]\n"
+      "fmla z12.h, z29.h, z1.h[5]\n"
+      "fmla z16.h, z29.h, z2.h[5]\n"
+      "fmla z20.h, z29.h, z3.h[5]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z29.h, z4.h[5]\n"
+      "fmla z9.h, z28.h, z0.h[5]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[5]\n"
+      "fmla z17.h, z28.h, z2.h[5]\n"
+      "fmla z21.h, z28.h, z3.h[5]\n"
+      "fmla z25.h, z28.h, z4.h[5]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
+      "fmla z10.h, z29.h, z0.h[5]\n"
+      "fmla z14.h, z29.h, z1.h[5]\n"
+      "fmla z18.h, z29.h, z2.h[5]\n"
+      "fmla z22.h, z29.h, z3.h[5]\n"
+      "fmla z26.h, z29.h, z4.h[5]\n"
+      "fmla z11.h, z28.h, z0.h[5]\n"
+      "fmla z15.h, z28.h, z1.h[5]\n"
+      "fmla z19.h, z28.h, z2.h[5]\n"
+      "fmla z23.h, z28.h, z3.h[5]\n"
+      "fmla z27.h, z28.h, z4.h[5]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[6]\n"
+      "fmla z12.h, z29.h, z1.h[6]\n"
+      "fmla z16.h, z29.h, z2.h[6]\n"
+      "fmla z20.h, z29.h, z3.h[6]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z29.h, z4.h[6]\n"
+      "fmla z9.h, z28.h, z0.h[6]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[6]\n"
+      "fmla z17.h, z28.h, z2.h[6]\n"
+      "fmla z21.h, z28.h, z3.h[6]\n"
+      "fmla z25.h, z28.h, z4.h[6]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
+      "fmla z10.h, z29.h, z0.h[6]\n"
+      "fmla z14.h, z29.h, z1.h[6]\n"
+      "fmla z18.h, z29.h, z2.h[6]\n"
+      "fmla z22.h, z29.h, z3.h[6]\n"
+      "fmla z26.h, z29.h, z4.h[6]\n"
+      "fmla z11.h, z28.h, z0.h[6]\n"
+      "fmla z15.h, z28.h, z1.h[6]\n"
+      "fmla z19.h, z28.h, z2.h[6]\n"
+      "fmla z23.h, z28.h, z3.h[6]\n"
+      "fmla z27.h, z28.h, z4.h[6]\n"
       "ble 63f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z29.h }, p5/Z, [x10]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z29.h, z0.h[7]\n"
+      "fmla z12.h, z29.h, z1.h[7]\n"
+      "fmla z16.h, z29.h, z2.h[7]\n"
+      "fmla z20.h, z29.h, z3.h[7]\n"
+      "fmla z24.h, z29.h, z4.h[7]\n"
+      "fmla z9.h, z28.h, z0.h[7]\n"
+      "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.h, z28.h, z1.h[7]\n"
+      "fmla z17.h, z28.h, z2.h[7]\n"
+      "fmla z21.h, z28.h, z3.h[7]\n"
+      "fmla z25.h, z28.h, z4.h[7]\n"
+      "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
+      "fmla z10.h, z29.h, z0.h[7]\n"
+      "fmla z14.h, z29.h, z1.h[7]\n"
+      "fmla z18.h, z29.h, z2.h[7]\n"
+      "fmla z22.h, z29.h, z3.h[7]\n"
+      "fmla z26.h, z29.h, z4.h[7]\n"
+      "fmla z11.h, z28.h, z0.h[7]\n"
+      "fmla z15.h, z28.h, z1.h[7]\n"
+      "fmla z19.h, z28.h, z2.h[7]\n"
+      "fmla z23.h, z28.h, z3.h[7]\n"
+      "fmla z27.h, z28.h, z4.h[7]\n"
       "63:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2214,49 +2214,49 @@
       "add x22, x23, x20, LSL #1\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rh { z1.h }, p5/Z, [x20]\n"
+      "ld1rh { z29.h }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rh { z0.h }, p5/Z, [x20]\n"
-      "fmin z8.h, p5/M, z8.h, z1.h\n"
-      "fmin z9.h, p5/M, z9.h, z1.h\n"
-      "fmin z10.h, p5/M, z10.h, z1.h\n"
-      "fmin z11.h, p5/M, z11.h, z1.h\n"
-      "fmin z12.h, p5/M, z12.h, z1.h\n"
-      "fmin z13.h, p5/M, z13.h, z1.h\n"
-      "fmin z14.h, p5/M, z14.h, z1.h\n"
-      "fmin z15.h, p5/M, z15.h, z1.h\n"
-      "fmin z16.h, p5/M, z16.h, z1.h\n"
-      "fmin z17.h, p5/M, z17.h, z1.h\n"
-      "fmin z18.h, p5/M, z18.h, z1.h\n"
-      "fmin z19.h, p5/M, z19.h, z1.h\n"
-      "fmin z20.h, p5/M, z20.h, z1.h\n"
-      "fmin z21.h, p5/M, z21.h, z1.h\n"
-      "fmin z22.h, p5/M, z22.h, z1.h\n"
-      "fmin z23.h, p5/M, z23.h, z1.h\n"
-      "fmin z24.h, p5/M, z24.h, z1.h\n"
-      "fmin z25.h, p5/M, z25.h, z1.h\n"
-      "fmin z26.h, p5/M, z26.h, z1.h\n"
-      "fmin z27.h, p5/M, z27.h, z1.h\n"
-      "fmax z8.h, p5/M, z8.h, z0.h\n"
-      "fmax z9.h, p5/M, z9.h, z0.h\n"
-      "fmax z10.h, p5/M, z10.h, z0.h\n"
-      "fmax z11.h, p5/M, z11.h, z0.h\n"
-      "fmax z12.h, p5/M, z12.h, z0.h\n"
-      "fmax z13.h, p5/M, z13.h, z0.h\n"
-      "fmax z14.h, p5/M, z14.h, z0.h\n"
-      "fmax z15.h, p5/M, z15.h, z0.h\n"
-      "fmax z16.h, p5/M, z16.h, z0.h\n"
-      "fmax z17.h, p5/M, z17.h, z0.h\n"
-      "fmax z18.h, p5/M, z18.h, z0.h\n"
-      "fmax z19.h, p5/M, z19.h, z0.h\n"
-      "fmax z20.h, p5/M, z20.h, z0.h\n"
-      "fmax z21.h, p5/M, z21.h, z0.h\n"
-      "fmax z22.h, p5/M, z22.h, z0.h\n"
-      "fmax z23.h, p5/M, z23.h, z0.h\n"
-      "fmax z24.h, p5/M, z24.h, z0.h\n"
-      "fmax z25.h, p5/M, z25.h, z0.h\n"
-      "fmax z26.h, p5/M, z26.h, z0.h\n"
-      "fmax z27.h, p5/M, z27.h, z0.h\n"
+      "ld1rh { z28.h }, p5/Z, [x20]\n"
+      "fmin z8.h, p5/M, z8.h, z29.h\n"
+      "fmin z9.h, p5/M, z9.h, z29.h\n"
+      "fmin z10.h, p5/M, z10.h, z29.h\n"
+      "fmin z11.h, p5/M, z11.h, z29.h\n"
+      "fmin z12.h, p5/M, z12.h, z29.h\n"
+      "fmin z13.h, p5/M, z13.h, z29.h\n"
+      "fmin z14.h, p5/M, z14.h, z29.h\n"
+      "fmin z15.h, p5/M, z15.h, z29.h\n"
+      "fmin z16.h, p5/M, z16.h, z29.h\n"
+      "fmin z17.h, p5/M, z17.h, z29.h\n"
+      "fmin z18.h, p5/M, z18.h, z29.h\n"
+      "fmin z19.h, p5/M, z19.h, z29.h\n"
+      "fmin z20.h, p5/M, z20.h, z29.h\n"
+      "fmin z21.h, p5/M, z21.h, z29.h\n"
+      "fmin z22.h, p5/M, z22.h, z29.h\n"
+      "fmin z23.h, p5/M, z23.h, z29.h\n"
+      "fmin z24.h, p5/M, z24.h, z29.h\n"
+      "fmin z25.h, p5/M, z25.h, z29.h\n"
+      "fmin z26.h, p5/M, z26.h, z29.h\n"
+      "fmin z27.h, p5/M, z27.h, z29.h\n"
+      "fmax z8.h, p5/M, z8.h, z28.h\n"
+      "fmax z9.h, p5/M, z9.h, z28.h\n"
+      "fmax z10.h, p5/M, z10.h, z28.h\n"
+      "fmax z11.h, p5/M, z11.h, z28.h\n"
+      "fmax z12.h, p5/M, z12.h, z28.h\n"
+      "fmax z13.h, p5/M, z13.h, z28.h\n"
+      "fmax z14.h, p5/M, z14.h, z28.h\n"
+      "fmax z15.h, p5/M, z15.h, z28.h\n"
+      "fmax z16.h, p5/M, z16.h, z28.h\n"
+      "fmax z17.h, p5/M, z17.h, z28.h\n"
+      "fmax z18.h, p5/M, z18.h, z28.h\n"
+      "fmax z19.h, p5/M, z19.h, z28.h\n"
+      "fmax z20.h, p5/M, z20.h, z28.h\n"
+      "fmax z21.h, p5/M, z21.h, z28.h\n"
+      "fmax z22.h, p5/M, z22.h, z28.h\n"
+      "fmax z23.h, p5/M, z23.h, z28.h\n"
+      "fmax z24.h, p5/M, z24.h, z28.h\n"
+      "fmax z25.h, p5/M, z25.h, z28.h\n"
+      "fmax z26.h, p5/M, z26.h, z28.h\n"
+      "fmax z27.h, p5/M, z27.h, z28.h\n"
       "64:"  // Height 5: No activation
       "st1h { z8.h }, p4, [x9]\n"
       "st1h { z9.h }, p3, [x9, #1, MUL VL]\n"
@@ -2331,35 +2331,35 @@
       "68:"  // Height 6: no bias
       "tbz %x[flags], #0, 69f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "ld1h { z8.h }, p4/Z, [x9]\n"
+      "add x24, x9, x20, LSL #1\n"
       "add x23, x24, x20, LSL #1\n"
+      "ld1h { z8.h }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #1\n"
+      "add x21, x22, x20, LSL #1\n"
       "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x20, x21, x20, LSL #1\n"
       "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1h { z12.h }, p4/Z, [x25]\n"
-      "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1h { z16.h }, p4/Z, [x24]\n"
-      "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1h { z20.h }, p4/Z, [x23]\n"
-      "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1h { z24.h }, p4/Z, [x22]\n"
-      "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1h { z28.h }, p4/Z, [x21]\n"
-      "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x24]\n"
+      "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x23]\n"
+      "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x22]\n"
+      "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x21]\n"
+      "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1h { z28.h }, p4/Z, [x20]\n"
+      "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n"
       "b 70f\n"
       "69:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -2391,16 +2391,16 @@
       "71:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 72f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 73f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #1\n"
@@ -2412,256 +2412,256 @@
       "b 73f\n"
       "72:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #1\n"
-      "add x24, x25, x20, LSL #1\n"
-      "add x23, x24, x20, LSL #1\n"
-      "add x22, x23, x20, LSL #1\n"
-      "add x21, x22, x20, LSL #1\n"
+      "add x25, x26, x21, LSL #1\n"
+      "add x24, x25, x21, LSL #1\n"
+      "add x23, x24, x21, LSL #1\n"
+      "add x22, x23, x21, LSL #1\n"
+      "add x21, x22, x21, LSL #1\n"
       "73:"  // Height 6: input setup done
       "cmp x27, #0x8\n"
       "ble 75f\n"
       "74:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.h, XZR, x27\n"
-      "ld1rqh { z0.h }, p0/Z, [x26]\n"
-      "ld1rqh { z1.h }, p0/Z, [x25]\n"
+      "ld1rqh { z7.h }, p0/Z, [x26]\n"
+      "ld1rqh { z6.h }, p0/Z, [x25]\n"
       "sub x27, x27, #0x8\n"
-      "ld1rqh { z2.h }, p0/Z, [x24]\n"
-      "ld1rqh { z3.h }, p0/Z, [x23]\n"
+      "ld1rqh { z5.h }, p0/Z, [x24]\n"
+      "ld1rqh { z4.h }, p0/Z, [x23]\n"
       "cmp x27, #0x8\n"
       "add x26, x26, #0x10\n"
-      "ld1rqh { z4.h }, p0/Z, [x22]\n"
-      "ld1rqh { z5.h }, p0/Z, [x21]\n"
+      "ld1rqh { z3.h }, p0/Z, [x22]\n"
+      "ld1rqh { z2.h }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[0]\n"
+      "fmla z12.h, z1.h, z6.h[0]\n"
+      "fmla z16.h, z1.h, z5.h[0]\n"
+      "fmla z20.h, z1.h, z4.h[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z28.h, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z24.h, z1.h, z3.h[0]\n"
+      "fmla z28.h, z1.h, z2.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "fmla z29.h, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z30.h, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
-      "fmla z31.h, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z28.h, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "fmla z29.h, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[0]\n"
+      "fmla z13.h, z0.h, z6.h[0]\n"
+      "fmla z17.h, z0.h, z5.h[0]\n"
+      "fmla z21.h, z0.h, z4.h[0]\n"
+      "fmla z25.h, z0.h, z3.h[0]\n"
+      "fmla z29.h, z0.h, z2.h[0]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[0]\n"
+      "fmla z14.h, z1.h, z6.h[0]\n"
+      "fmla z18.h, z1.h, z5.h[0]\n"
+      "fmla z22.h, z1.h, z4.h[0]\n"
+      "fmla z26.h, z1.h, z3.h[0]\n"
+      "fmla z30.h, z1.h, z2.h[0]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[0]\n"
+      "fmla z15.h, z0.h, z6.h[0]\n"
+      "fmla z19.h, z0.h, z5.h[0]\n"
+      "fmla z23.h, z0.h, z4.h[0]\n"
+      "fmla z27.h, z0.h, z3.h[0]\n"
+      "fmla z31.h, z0.h, z2.h[0]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[1]\n"
+      "fmla z12.h, z1.h, z6.h[1]\n"
+      "fmla z16.h, z1.h, z5.h[1]\n"
+      "fmla z20.h, z1.h, z4.h[1]\n"
+      "fmla z24.h, z1.h, z3.h[1]\n"
+      "fmla z28.h, z1.h, z2.h[1]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[1]\n"
+      "fmla z13.h, z0.h, z6.h[1]\n"
+      "fmla z17.h, z0.h, z5.h[1]\n"
+      "fmla z21.h, z0.h, z4.h[1]\n"
+      "fmla z25.h, z0.h, z3.h[1]\n"
+      "fmla z29.h, z0.h, z2.h[1]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z30.h, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
-      "fmla z31.h, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z28.h, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "fmla z29.h, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z30.h, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
-      "fmla z31.h, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z28.h, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "fmla z29.h, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z30.h, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
-      "fmla z31.h, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z28.h, z6.h, z5.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "fmla z29.h, z7.h, z5.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z30.h, z6.h, z5.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
-      "fmla z31.h, z7.h, z5.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z28.h, z6.h, z5.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "fmla z29.h, z7.h, z5.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[1]\n"
+      "fmla z14.h, z1.h, z6.h[1]\n"
+      "fmla z18.h, z1.h, z5.h[1]\n"
+      "fmla z22.h, z1.h, z4.h[1]\n"
+      "fmla z26.h, z1.h, z3.h[1]\n"
+      "fmla z30.h, z1.h, z2.h[1]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[1]\n"
+      "fmla z15.h, z0.h, z6.h[1]\n"
+      "fmla z19.h, z0.h, z5.h[1]\n"
+      "fmla z23.h, z0.h, z4.h[1]\n"
+      "fmla z27.h, z0.h, z3.h[1]\n"
+      "fmla z31.h, z0.h, z2.h[1]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[2]\n"
+      "fmla z12.h, z1.h, z6.h[2]\n"
+      "fmla z16.h, z1.h, z5.h[2]\n"
+      "fmla z20.h, z1.h, z4.h[2]\n"
+      "fmla z24.h, z1.h, z3.h[2]\n"
+      "fmla z28.h, z1.h, z2.h[2]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[2]\n"
+      "fmla z13.h, z0.h, z6.h[2]\n"
+      "fmla z17.h, z0.h, z5.h[2]\n"
+      "fmla z21.h, z0.h, z4.h[2]\n"
+      "fmla z25.h, z0.h, z3.h[2]\n"
+      "fmla z29.h, z0.h, z2.h[2]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[2]\n"
+      "fmla z14.h, z1.h, z6.h[2]\n"
+      "fmla z18.h, z1.h, z5.h[2]\n"
+      "fmla z22.h, z1.h, z4.h[2]\n"
+      "fmla z26.h, z1.h, z3.h[2]\n"
+      "fmla z30.h, z1.h, z2.h[2]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[2]\n"
+      "fmla z15.h, z0.h, z6.h[2]\n"
+      "fmla z19.h, z0.h, z5.h[2]\n"
+      "fmla z23.h, z0.h, z4.h[2]\n"
+      "fmla z27.h, z0.h, z3.h[2]\n"
+      "fmla z31.h, z0.h, z2.h[2]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[3]\n"
+      "fmla z12.h, z1.h, z6.h[3]\n"
+      "fmla z16.h, z1.h, z5.h[3]\n"
+      "fmla z20.h, z1.h, z4.h[3]\n"
+      "fmla z24.h, z1.h, z3.h[3]\n"
+      "fmla z28.h, z1.h, z2.h[3]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[3]\n"
+      "fmla z13.h, z0.h, z6.h[3]\n"
+      "fmla z17.h, z0.h, z5.h[3]\n"
+      "fmla z21.h, z0.h, z4.h[3]\n"
+      "fmla z25.h, z0.h, z3.h[3]\n"
+      "fmla z29.h, z0.h, z2.h[3]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[3]\n"
+      "fmla z14.h, z1.h, z6.h[3]\n"
+      "fmla z18.h, z1.h, z5.h[3]\n"
+      "fmla z22.h, z1.h, z4.h[3]\n"
+      "fmla z26.h, z1.h, z3.h[3]\n"
+      "fmla z30.h, z1.h, z2.h[3]\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "fmla z11.h, z0.h, z7.h[3]\n"
+      "fmla z15.h, z0.h, z6.h[3]\n"
+      "fmla z19.h, z0.h, z5.h[3]\n"
+      "fmla z23.h, z0.h, z4.h[3]\n"
+      "fmla z27.h, z0.h, z3.h[3]\n"
+      "fmla z31.h, z0.h, z2.h[3]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[4]\n"
+      "fmla z12.h, z1.h, z6.h[4]\n"
+      "fmla z16.h, z1.h, z5.h[4]\n"
+      "fmla z20.h, z1.h, z4.h[4]\n"
+      "fmla z24.h, z1.h, z3.h[4]\n"
+      "fmla z28.h, z1.h, z2.h[4]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[4]\n"
+      "fmla z13.h, z0.h, z6.h[4]\n"
+      "fmla z17.h, z0.h, z5.h[4]\n"
+      "fmla z21.h, z0.h, z4.h[4]\n"
+      "fmla z25.h, z0.h, z3.h[4]\n"
+      "fmla z29.h, z0.h, z2.h[4]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[4]\n"
+      "fmla z14.h, z1.h, z6.h[4]\n"
+      "fmla z18.h, z1.h, z5.h[4]\n"
+      "fmla z22.h, z1.h, z4.h[4]\n"
+      "fmla z26.h, z1.h, z3.h[4]\n"
+      "fmla z30.h, z1.h, z2.h[4]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[4]\n"
+      "fmla z15.h, z0.h, z6.h[4]\n"
+      "fmla z19.h, z0.h, z5.h[4]\n"
+      "fmla z23.h, z0.h, z4.h[4]\n"
+      "fmla z27.h, z0.h, z3.h[4]\n"
+      "fmla z31.h, z0.h, z2.h[4]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[5]\n"
+      "fmla z12.h, z1.h, z6.h[5]\n"
+      "fmla z16.h, z1.h, z5.h[5]\n"
+      "fmla z20.h, z1.h, z4.h[5]\n"
+      "fmla z24.h, z1.h, z3.h[5]\n"
+      "fmla z28.h, z1.h, z2.h[5]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[5]\n"
+      "fmla z13.h, z0.h, z6.h[5]\n"
+      "fmla z17.h, z0.h, z5.h[5]\n"
+      "fmla z21.h, z0.h, z4.h[5]\n"
+      "fmla z25.h, z0.h, z3.h[5]\n"
+      "fmla z29.h, z0.h, z2.h[5]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z30.h, z6.h, z5.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
-      "fmla z31.h, z7.h, z5.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z28.h, z6.h, z5.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "fmla z29.h, z7.h, z5.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z30.h, z6.h, z5.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
-      "fmla z31.h, z7.h, z5.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z28.h, z6.h, z5.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "fmla z29.h, z7.h, z5.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z30.h, z6.h, z5.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
-      "fmla z31.h, z7.h, z5.h[7]\n"
+      "fmla z10.h, z1.h, z7.h[5]\n"
+      "fmla z14.h, z1.h, z6.h[5]\n"
+      "fmla z18.h, z1.h, z5.h[5]\n"
+      "fmla z22.h, z1.h, z4.h[5]\n"
+      "fmla z26.h, z1.h, z3.h[5]\n"
+      "fmla z30.h, z1.h, z2.h[5]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[5]\n"
+      "fmla z15.h, z0.h, z6.h[5]\n"
+      "fmla z19.h, z0.h, z5.h[5]\n"
+      "fmla z23.h, z0.h, z4.h[5]\n"
+      "fmla z27.h, z0.h, z3.h[5]\n"
+      "fmla z31.h, z0.h, z2.h[5]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[6]\n"
+      "fmla z12.h, z1.h, z6.h[6]\n"
+      "fmla z16.h, z1.h, z5.h[6]\n"
+      "fmla z20.h, z1.h, z4.h[6]\n"
+      "fmla z24.h, z1.h, z3.h[6]\n"
+      "fmla z28.h, z1.h, z2.h[6]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[6]\n"
+      "fmla z13.h, z0.h, z6.h[6]\n"
+      "fmla z17.h, z0.h, z5.h[6]\n"
+      "fmla z21.h, z0.h, z4.h[6]\n"
+      "fmla z25.h, z0.h, z3.h[6]\n"
+      "fmla z29.h, z0.h, z2.h[6]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[6]\n"
+      "fmla z14.h, z1.h, z6.h[6]\n"
+      "fmla z18.h, z1.h, z5.h[6]\n"
+      "fmla z22.h, z1.h, z4.h[6]\n"
+      "fmla z26.h, z1.h, z3.h[6]\n"
+      "fmla z30.h, z1.h, z2.h[6]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.h, z0.h, z7.h[6]\n"
+      "fmla z15.h, z0.h, z6.h[6]\n"
+      "fmla z19.h, z0.h, z5.h[6]\n"
+      "fmla z23.h, z0.h, z4.h[6]\n"
+      "fmla z27.h, z0.h, z3.h[6]\n"
+      "fmla z31.h, z0.h, z2.h[6]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.h, z1.h, z7.h[7]\n"
+      "fmla z12.h, z1.h, z6.h[7]\n"
+      "fmla z16.h, z1.h, z5.h[7]\n"
+      "fmla z20.h, z1.h, z4.h[7]\n"
+      "fmla z24.h, z1.h, z3.h[7]\n"
+      "fmla z28.h, z1.h, z2.h[7]\n"
+      "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.h, z0.h, z7.h[7]\n"
+      "fmla z13.h, z0.h, z6.h[7]\n"
+      "fmla z17.h, z0.h, z5.h[7]\n"
+      "fmla z21.h, z0.h, z4.h[7]\n"
+      "fmla z25.h, z0.h, z3.h[7]\n"
+      "fmla z29.h, z0.h, z2.h[7]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.h, z1.h, z7.h[7]\n"
+      "fmla z14.h, z1.h, z6.h[7]\n"
+      "fmla z18.h, z1.h, z5.h[7]\n"
+      "fmla z22.h, z1.h, z4.h[7]\n"
+      "fmla z26.h, z1.h, z3.h[7]\n"
+      "fmla z30.h, z1.h, z2.h[7]\n"
+      "fmla z11.h, z0.h, z7.h[7]\n"
+      "fmla z15.h, z0.h, z6.h[7]\n"
+      "fmla z19.h, z0.h, z5.h[7]\n"
+      "fmla z23.h, z0.h, z4.h[7]\n"
+      "fmla z27.h, z0.h, z3.h[7]\n"
+      "fmla z31.h, z0.h, z2.h[7]\n"
       "bgt 74b\n"
       "75:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.h, XZR, x27\n"
@@ -2672,251 +2672,251 @@
       "ld1rqh { z3.h }, p0/Z, [x23]\n"
       "ld1rqh { z4.h }, p0/Z, [x22]\n"
       "ld1rqh { z5.h }, p0/Z, [x21]\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[0]\n"
-      "fmla z16.h, z6.h, z2.h[0]\n"
-      "fmla z20.h, z6.h, z3.h[0]\n"
-      "fmla z24.h, z6.h, z4.h[0]\n"
-      "fmla z28.h, z6.h, z5.h[0]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[0]\n"
-      "fmla z17.h, z7.h, z2.h[0]\n"
-      "fmla z21.h, z7.h, z3.h[0]\n"
-      "fmla z25.h, z7.h, z4.h[0]\n"
-      "fmla z29.h, z7.h, z5.h[0]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[0]\n"
+      "fmla z12.h, z7.h, z1.h[0]\n"
+      "fmla z16.h, z7.h, z2.h[0]\n"
+      "fmla z20.h, z7.h, z3.h[0]\n"
+      "fmla z24.h, z7.h, z4.h[0]\n"
+      "fmla z28.h, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[0]\n"
+      "fmla z13.h, z6.h, z1.h[0]\n"
+      "fmla z17.h, z6.h, z2.h[0]\n"
+      "fmla z21.h, z6.h, z3.h[0]\n"
+      "fmla z25.h, z6.h, z4.h[0]\n"
+      "fmla z29.h, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[0]\n"
-      "fmla z14.h, z6.h, z1.h[0]\n"
-      "fmla z18.h, z6.h, z2.h[0]\n"
-      "fmla z22.h, z6.h, z3.h[0]\n"
-      "fmla z26.h, z6.h, z4.h[0]\n"
-      "fmla z30.h, z6.h, z5.h[0]\n"
-      "fmla z11.h, z7.h, z0.h[0]\n"
-      "fmla z15.h, z7.h, z1.h[0]\n"
-      "fmla z19.h, z7.h, z2.h[0]\n"
-      "fmla z23.h, z7.h, z3.h[0]\n"
-      "fmla z27.h, z7.h, z4.h[0]\n"
-      "fmla z31.h, z7.h, z5.h[0]\n"
+      "fmla z10.h, z7.h, z0.h[0]\n"
+      "fmla z14.h, z7.h, z1.h[0]\n"
+      "fmla z18.h, z7.h, z2.h[0]\n"
+      "fmla z22.h, z7.h, z3.h[0]\n"
+      "fmla z26.h, z7.h, z4.h[0]\n"
+      "fmla z30.h, z7.h, z5.h[0]\n"
+      "fmla z11.h, z6.h, z0.h[0]\n"
+      "fmla z15.h, z6.h, z1.h[0]\n"
+      "fmla z19.h, z6.h, z2.h[0]\n"
+      "fmla z23.h, z6.h, z3.h[0]\n"
+      "fmla z27.h, z6.h, z4.h[0]\n"
+      "fmla z31.h, z6.h, z5.h[0]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[1]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z16.h, z6.h, z2.h[1]\n"
-      "fmla z20.h, z6.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[1]\n"
+      "fmla z12.h, z7.h, z1.h[1]\n"
+      "fmla z16.h, z7.h, z2.h[1]\n"
+      "fmla z20.h, z7.h, z3.h[1]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[1]\n"
-      "fmla z28.h, z6.h, z5.h[1]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[1]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z17.h, z7.h, z2.h[1]\n"
-      "fmla z21.h, z7.h, z3.h[1]\n"
-      "fmla z25.h, z7.h, z4.h[1]\n"
-      "fmla z29.h, z7.h, z5.h[1]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z7.h, z4.h[1]\n"
+      "fmla z28.h, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[1]\n"
+      "fmla z13.h, z6.h, z1.h[1]\n"
+      "fmla z17.h, z6.h, z2.h[1]\n"
+      "fmla z21.h, z6.h, z3.h[1]\n"
+      "fmla z25.h, z6.h, z4.h[1]\n"
+      "fmla z29.h, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[1]\n"
-      "fmla z14.h, z6.h, z1.h[1]\n"
-      "fmla z18.h, z6.h, z2.h[1]\n"
-      "fmla z22.h, z6.h, z3.h[1]\n"
-      "fmla z26.h, z6.h, z4.h[1]\n"
-      "fmla z30.h, z6.h, z5.h[1]\n"
-      "fmla z11.h, z7.h, z0.h[1]\n"
-      "fmla z15.h, z7.h, z1.h[1]\n"
-      "fmla z19.h, z7.h, z2.h[1]\n"
-      "fmla z23.h, z7.h, z3.h[1]\n"
-      "fmla z27.h, z7.h, z4.h[1]\n"
-      "fmla z31.h, z7.h, z5.h[1]\n"
+      "fmla z10.h, z7.h, z0.h[1]\n"
+      "fmla z14.h, z7.h, z1.h[1]\n"
+      "fmla z18.h, z7.h, z2.h[1]\n"
+      "fmla z22.h, z7.h, z3.h[1]\n"
+      "fmla z26.h, z7.h, z4.h[1]\n"
+      "fmla z30.h, z7.h, z5.h[1]\n"
+      "fmla z11.h, z6.h, z0.h[1]\n"
+      "fmla z15.h, z6.h, z1.h[1]\n"
+      "fmla z19.h, z6.h, z2.h[1]\n"
+      "fmla z23.h, z6.h, z3.h[1]\n"
+      "fmla z27.h, z6.h, z4.h[1]\n"
+      "fmla z31.h, z6.h, z5.h[1]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[2]\n"
-      "fmla z12.h, z6.h, z1.h[2]\n"
-      "fmla z16.h, z6.h, z2.h[2]\n"
-      "fmla z20.h, z6.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[2]\n"
+      "fmla z12.h, z7.h, z1.h[2]\n"
+      "fmla z16.h, z7.h, z2.h[2]\n"
+      "fmla z20.h, z7.h, z3.h[2]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[2]\n"
-      "fmla z28.h, z6.h, z5.h[2]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[2]\n"
-      "fmla z13.h, z7.h, z1.h[2]\n"
-      "fmla z17.h, z7.h, z2.h[2]\n"
-      "fmla z21.h, z7.h, z3.h[2]\n"
-      "fmla z25.h, z7.h, z4.h[2]\n"
-      "fmla z29.h, z7.h, z5.h[2]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z7.h, z4.h[2]\n"
+      "fmla z28.h, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[2]\n"
+      "fmla z13.h, z6.h, z1.h[2]\n"
+      "fmla z17.h, z6.h, z2.h[2]\n"
+      "fmla z21.h, z6.h, z3.h[2]\n"
+      "fmla z25.h, z6.h, z4.h[2]\n"
+      "fmla z29.h, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[2]\n"
-      "fmla z14.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z2.h[2]\n"
-      "fmla z22.h, z6.h, z3.h[2]\n"
-      "fmla z26.h, z6.h, z4.h[2]\n"
-      "fmla z30.h, z6.h, z5.h[2]\n"
-      "fmla z11.h, z7.h, z0.h[2]\n"
-      "fmla z15.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z2.h[2]\n"
-      "fmla z23.h, z7.h, z3.h[2]\n"
-      "fmla z27.h, z7.h, z4.h[2]\n"
-      "fmla z31.h, z7.h, z5.h[2]\n"
+      "fmla z10.h, z7.h, z0.h[2]\n"
+      "fmla z14.h, z7.h, z1.h[2]\n"
+      "fmla z18.h, z7.h, z2.h[2]\n"
+      "fmla z22.h, z7.h, z3.h[2]\n"
+      "fmla z26.h, z7.h, z4.h[2]\n"
+      "fmla z30.h, z7.h, z5.h[2]\n"
+      "fmla z11.h, z6.h, z0.h[2]\n"
+      "fmla z15.h, z6.h, z1.h[2]\n"
+      "fmla z19.h, z6.h, z2.h[2]\n"
+      "fmla z23.h, z6.h, z3.h[2]\n"
+      "fmla z27.h, z6.h, z4.h[2]\n"
+      "fmla z31.h, z6.h, z5.h[2]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[3]\n"
-      "fmla z12.h, z6.h, z1.h[3]\n"
-      "fmla z16.h, z6.h, z2.h[3]\n"
-      "fmla z20.h, z6.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[3]\n"
+      "fmla z12.h, z7.h, z1.h[3]\n"
+      "fmla z16.h, z7.h, z2.h[3]\n"
+      "fmla z20.h, z7.h, z3.h[3]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[3]\n"
-      "fmla z28.h, z6.h, z5.h[3]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[3]\n"
-      "fmla z13.h, z7.h, z1.h[3]\n"
-      "fmla z17.h, z7.h, z2.h[3]\n"
-      "fmla z21.h, z7.h, z3.h[3]\n"
-      "fmla z25.h, z7.h, z4.h[3]\n"
-      "fmla z29.h, z7.h, z5.h[3]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z7.h, z4.h[3]\n"
+      "fmla z28.h, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[3]\n"
+      "fmla z13.h, z6.h, z1.h[3]\n"
+      "fmla z17.h, z6.h, z2.h[3]\n"
+      "fmla z21.h, z6.h, z3.h[3]\n"
+      "fmla z25.h, z6.h, z4.h[3]\n"
+      "fmla z29.h, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[3]\n"
-      "fmla z14.h, z6.h, z1.h[3]\n"
-      "fmla z18.h, z6.h, z2.h[3]\n"
-      "fmla z22.h, z6.h, z3.h[3]\n"
-      "fmla z26.h, z6.h, z4.h[3]\n"
-      "fmla z30.h, z6.h, z5.h[3]\n"
-      "fmla z11.h, z7.h, z0.h[3]\n"
-      "fmla z15.h, z7.h, z1.h[3]\n"
-      "fmla z19.h, z7.h, z2.h[3]\n"
-      "fmla z23.h, z7.h, z3.h[3]\n"
-      "fmla z27.h, z7.h, z4.h[3]\n"
-      "fmla z31.h, z7.h, z5.h[3]\n"
+      "fmla z10.h, z7.h, z0.h[3]\n"
+      "fmla z14.h, z7.h, z1.h[3]\n"
+      "fmla z18.h, z7.h, z2.h[3]\n"
+      "fmla z22.h, z7.h, z3.h[3]\n"
+      "fmla z26.h, z7.h, z4.h[3]\n"
+      "fmla z30.h, z7.h, z5.h[3]\n"
+      "fmla z11.h, z6.h, z0.h[3]\n"
+      "fmla z15.h, z6.h, z1.h[3]\n"
+      "fmla z19.h, z6.h, z2.h[3]\n"
+      "fmla z23.h, z6.h, z3.h[3]\n"
+      "fmla z27.h, z6.h, z4.h[3]\n"
+      "fmla z31.h, z6.h, z5.h[3]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[4]\n"
-      "fmla z12.h, z6.h, z1.h[4]\n"
-      "fmla z16.h, z6.h, z2.h[4]\n"
-      "fmla z20.h, z6.h, z3.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[4]\n"
+      "fmla z12.h, z7.h, z1.h[4]\n"
+      "fmla z16.h, z7.h, z2.h[4]\n"
+      "fmla z20.h, z7.h, z3.h[4]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[4]\n"
-      "fmla z28.h, z6.h, z5.h[4]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[4]\n"
-      "fmla z13.h, z7.h, z1.h[4]\n"
-      "fmla z17.h, z7.h, z2.h[4]\n"
-      "fmla z21.h, z7.h, z3.h[4]\n"
-      "fmla z25.h, z7.h, z4.h[4]\n"
-      "fmla z29.h, z7.h, z5.h[4]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z7.h, z4.h[4]\n"
+      "fmla z28.h, z7.h, z5.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[4]\n"
+      "fmla z13.h, z6.h, z1.h[4]\n"
+      "fmla z17.h, z6.h, z2.h[4]\n"
+      "fmla z21.h, z6.h, z3.h[4]\n"
+      "fmla z25.h, z6.h, z4.h[4]\n"
+      "fmla z29.h, z6.h, z5.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[4]\n"
-      "fmla z14.h, z6.h, z1.h[4]\n"
-      "fmla z18.h, z6.h, z2.h[4]\n"
-      "fmla z22.h, z6.h, z3.h[4]\n"
-      "fmla z26.h, z6.h, z4.h[4]\n"
-      "fmla z30.h, z6.h, z5.h[4]\n"
-      "fmla z11.h, z7.h, z0.h[4]\n"
-      "fmla z15.h, z7.h, z1.h[4]\n"
-      "fmla z19.h, z7.h, z2.h[4]\n"
-      "fmla z23.h, z7.h, z3.h[4]\n"
-      "fmla z27.h, z7.h, z4.h[4]\n"
-      "fmla z31.h, z7.h, z5.h[4]\n"
+      "fmla z10.h, z7.h, z0.h[4]\n"
+      "fmla z14.h, z7.h, z1.h[4]\n"
+      "fmla z18.h, z7.h, z2.h[4]\n"
+      "fmla z22.h, z7.h, z3.h[4]\n"
+      "fmla z26.h, z7.h, z4.h[4]\n"
+      "fmla z30.h, z7.h, z5.h[4]\n"
+      "fmla z11.h, z6.h, z0.h[4]\n"
+      "fmla z15.h, z6.h, z1.h[4]\n"
+      "fmla z19.h, z6.h, z2.h[4]\n"
+      "fmla z23.h, z6.h, z3.h[4]\n"
+      "fmla z27.h, z6.h, z4.h[4]\n"
+      "fmla z31.h, z6.h, z5.h[4]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[5]\n"
-      "fmla z12.h, z6.h, z1.h[5]\n"
-      "fmla z16.h, z6.h, z2.h[5]\n"
-      "fmla z20.h, z6.h, z3.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[5]\n"
+      "fmla z12.h, z7.h, z1.h[5]\n"
+      "fmla z16.h, z7.h, z2.h[5]\n"
+      "fmla z20.h, z7.h, z3.h[5]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[5]\n"
-      "fmla z28.h, z6.h, z5.h[5]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[5]\n"
-      "fmla z13.h, z7.h, z1.h[5]\n"
-      "fmla z17.h, z7.h, z2.h[5]\n"
-      "fmla z21.h, z7.h, z3.h[5]\n"
-      "fmla z25.h, z7.h, z4.h[5]\n"
-      "fmla z29.h, z7.h, z5.h[5]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z7.h, z4.h[5]\n"
+      "fmla z28.h, z7.h, z5.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[5]\n"
+      "fmla z13.h, z6.h, z1.h[5]\n"
+      "fmla z17.h, z6.h, z2.h[5]\n"
+      "fmla z21.h, z6.h, z3.h[5]\n"
+      "fmla z25.h, z6.h, z4.h[5]\n"
+      "fmla z29.h, z6.h, z5.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[5]\n"
-      "fmla z14.h, z6.h, z1.h[5]\n"
-      "fmla z18.h, z6.h, z2.h[5]\n"
-      "fmla z22.h, z6.h, z3.h[5]\n"
-      "fmla z26.h, z6.h, z4.h[5]\n"
-      "fmla z30.h, z6.h, z5.h[5]\n"
-      "fmla z11.h, z7.h, z0.h[5]\n"
-      "fmla z15.h, z7.h, z1.h[5]\n"
-      "fmla z19.h, z7.h, z2.h[5]\n"
-      "fmla z23.h, z7.h, z3.h[5]\n"
-      "fmla z27.h, z7.h, z4.h[5]\n"
-      "fmla z31.h, z7.h, z5.h[5]\n"
+      "fmla z10.h, z7.h, z0.h[5]\n"
+      "fmla z14.h, z7.h, z1.h[5]\n"
+      "fmla z18.h, z7.h, z2.h[5]\n"
+      "fmla z22.h, z7.h, z3.h[5]\n"
+      "fmla z26.h, z7.h, z4.h[5]\n"
+      "fmla z30.h, z7.h, z5.h[5]\n"
+      "fmla z11.h, z6.h, z0.h[5]\n"
+      "fmla z15.h, z6.h, z1.h[5]\n"
+      "fmla z19.h, z6.h, z2.h[5]\n"
+      "fmla z23.h, z6.h, z3.h[5]\n"
+      "fmla z27.h, z6.h, z4.h[5]\n"
+      "fmla z31.h, z6.h, z5.h[5]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[6]\n"
-      "fmla z12.h, z6.h, z1.h[6]\n"
-      "fmla z16.h, z6.h, z2.h[6]\n"
-      "fmla z20.h, z6.h, z3.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[6]\n"
+      "fmla z12.h, z7.h, z1.h[6]\n"
+      "fmla z16.h, z7.h, z2.h[6]\n"
+      "fmla z20.h, z7.h, z3.h[6]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.h, z6.h, z4.h[6]\n"
-      "fmla z28.h, z6.h, z5.h[6]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[6]\n"
-      "fmla z13.h, z7.h, z1.h[6]\n"
-      "fmla z17.h, z7.h, z2.h[6]\n"
-      "fmla z21.h, z7.h, z3.h[6]\n"
-      "fmla z25.h, z7.h, z4.h[6]\n"
-      "fmla z29.h, z7.h, z5.h[6]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.h, z7.h, z4.h[6]\n"
+      "fmla z28.h, z7.h, z5.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[6]\n"
+      "fmla z13.h, z6.h, z1.h[6]\n"
+      "fmla z17.h, z6.h, z2.h[6]\n"
+      "fmla z21.h, z6.h, z3.h[6]\n"
+      "fmla z25.h, z6.h, z4.h[6]\n"
+      "fmla z29.h, z6.h, z5.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[6]\n"
-      "fmla z14.h, z6.h, z1.h[6]\n"
-      "fmla z18.h, z6.h, z2.h[6]\n"
-      "fmla z22.h, z6.h, z3.h[6]\n"
-      "fmla z26.h, z6.h, z4.h[6]\n"
-      "fmla z30.h, z6.h, z5.h[6]\n"
-      "fmla z11.h, z7.h, z0.h[6]\n"
-      "fmla z15.h, z7.h, z1.h[6]\n"
-      "fmla z19.h, z7.h, z2.h[6]\n"
-      "fmla z23.h, z7.h, z3.h[6]\n"
-      "fmla z27.h, z7.h, z4.h[6]\n"
-      "fmla z31.h, z7.h, z5.h[6]\n"
+      "fmla z10.h, z7.h, z0.h[6]\n"
+      "fmla z14.h, z7.h, z1.h[6]\n"
+      "fmla z18.h, z7.h, z2.h[6]\n"
+      "fmla z22.h, z7.h, z3.h[6]\n"
+      "fmla z26.h, z7.h, z4.h[6]\n"
+      "fmla z30.h, z7.h, z5.h[6]\n"
+      "fmla z11.h, z6.h, z0.h[6]\n"
+      "fmla z15.h, z6.h, z1.h[6]\n"
+      "fmla z19.h, z6.h, z2.h[6]\n"
+      "fmla z23.h, z6.h, z3.h[6]\n"
+      "fmla z27.h, z6.h, z4.h[6]\n"
+      "fmla z31.h, z6.h, z5.h[6]\n"
       "ble 76f\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.h, z6.h, z0.h[7]\n"
-      "fmla z12.h, z6.h, z1.h[7]\n"
-      "fmla z16.h, z6.h, z2.h[7]\n"
-      "fmla z20.h, z6.h, z3.h[7]\n"
-      "fmla z24.h, z6.h, z4.h[7]\n"
-      "fmla z28.h, z6.h, z5.h[7]\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.h, z7.h, z0.h[7]\n"
-      "fmla z13.h, z7.h, z1.h[7]\n"
-      "fmla z17.h, z7.h, z2.h[7]\n"
-      "fmla z21.h, z7.h, z3.h[7]\n"
-      "fmla z25.h, z7.h, z4.h[7]\n"
-      "fmla z29.h, z7.h, z5.h[7]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x10]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.h, z7.h, z0.h[7]\n"
+      "fmla z12.h, z7.h, z1.h[7]\n"
+      "fmla z16.h, z7.h, z2.h[7]\n"
+      "fmla z20.h, z7.h, z3.h[7]\n"
+      "fmla z24.h, z7.h, z4.h[7]\n"
+      "fmla z28.h, z7.h, z5.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.h, z6.h, z0.h[7]\n"
+      "fmla z13.h, z6.h, z1.h[7]\n"
+      "fmla z17.h, z6.h, z2.h[7]\n"
+      "fmla z21.h, z6.h, z3.h[7]\n"
+      "fmla z25.h, z6.h, z4.h[7]\n"
+      "fmla z29.h, z6.h, z5.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.h, z6.h, z0.h[7]\n"
-      "fmla z14.h, z6.h, z1.h[7]\n"
-      "fmla z18.h, z6.h, z2.h[7]\n"
-      "fmla z22.h, z6.h, z3.h[7]\n"
-      "fmla z26.h, z6.h, z4.h[7]\n"
-      "fmla z30.h, z6.h, z5.h[7]\n"
-      "fmla z11.h, z7.h, z0.h[7]\n"
-      "fmla z15.h, z7.h, z1.h[7]\n"
-      "fmla z19.h, z7.h, z2.h[7]\n"
-      "fmla z23.h, z7.h, z3.h[7]\n"
-      "fmla z27.h, z7.h, z4.h[7]\n"
-      "fmla z31.h, z7.h, z5.h[7]\n"
+      "fmla z10.h, z7.h, z0.h[7]\n"
+      "fmla z14.h, z7.h, z1.h[7]\n"
+      "fmla z18.h, z7.h, z2.h[7]\n"
+      "fmla z22.h, z7.h, z3.h[7]\n"
+      "fmla z26.h, z7.h, z4.h[7]\n"
+      "fmla z30.h, z7.h, z5.h[7]\n"
+      "fmla z11.h, z6.h, z0.h[7]\n"
+      "fmla z15.h, z6.h, z1.h[7]\n"
+      "fmla z19.h, z6.h, z2.h[7]\n"
+      "fmla z23.h, z6.h, z3.h[7]\n"
+      "fmla z27.h, z6.h, z4.h[7]\n"
+      "fmla z31.h, z6.h, z5.h[7]\n"
       "76:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -3023,7 +3023,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "80:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -3031,4 +3030,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
index b63b143..880f9d1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -75,13 +75,16 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, float>::value) {
             switch (ci->get_cpu_model()) {
-                case CPUModel::V1:
-                    return { 15.65 };
                 default:
                     return { 6.667 };
+                case CPUModel::A510:
+                    return { 5.41 };
+                case CPUModel::V1:
+                    return { 15.65 };
+                case CPUModel::A64FX:
+                    return { 25.55 };
             }
         }
 
@@ -105,5 +108,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
index 9ae51af..66481f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -139,11 +139,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -159,12 +159,12 @@
       "9:"  // Height 1: Multiply loop: Main loop
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x26, x26, #0x4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1w { z6.s }, p4/Z, [x10]\n"
@@ -174,27 +174,27 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
       "addvl x10, x10, #4\n"
       "bne 6b\n"
       "tbz %x[flags], #1, 11f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z17.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z17.s\n"
+      "fmin z9.s, p4/M, z9.s, z17.s\n"
+      "fmin z10.s, p4/M, z10.s, z17.s\n"
+      "fmin z11.s, p4/M, z11.s, z17.s\n"
+      "fmax z8.s, p4/M, z8.s, z16.s\n"
+      "fmax z9.s, p4/M, z9.s, z16.s\n"
+      "fmax z10.s, p4/M, z10.s, z16.s\n"
+      "fmax z11.s, p4/M, z11.s, z16.s\n"
       "11:"  // Height 1: No activation
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -234,15 +234,15 @@
       "15:"  // Height 2: no bias
       "tbz %x[flags], #0, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x20]\n"
+      "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 17f\n"
       "16:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -258,12 +258,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -271,7 +271,7 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "20:"  // Height 2: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -282,18 +282,18 @@
       "21:"  // Height 2: Multiply loop: Main loop
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z12.s, p4/M, z6.s, z1.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
       "add x26, x26, #0x4\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "subs x27, x27, #0x1\n"
       "add x25, x25, #0x4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z14.s, p4/M, z17.s, z1.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
+      "fmla z15.s, p4/M, z16.s, z1.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1w { z6.s }, p4/Z, [x10]\n"
@@ -303,41 +303,41 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "fmla z8.s, p4/M, z6.s, z0.s\n"
       "fmla z12.s, p4/M, z6.s, z1.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
+      "fmla z10.s, p4/M, z17.s, z0.s\n"
+      "fmla z14.s, p4/M, z17.s, z1.s\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
+      "fmla z11.s, p4/M, z16.s, z0.s\n"
+      "fmla z15.s, p4/M, z16.s, z1.s\n"
       "bne 18b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #2\n"
       "tbz %x[flags], #1, 23f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z17.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
+      "ld1rw { z16.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z17.s\n"
+      "fmin z9.s, p4/M, z9.s, z17.s\n"
+      "fmin z10.s, p4/M, z10.s, z17.s\n"
+      "fmin z11.s, p4/M, z11.s, z17.s\n"
+      "fmin z12.s, p4/M, z12.s, z17.s\n"
+      "fmin z13.s, p4/M, z13.s, z17.s\n"
+      "fmin z14.s, p4/M, z14.s, z17.s\n"
+      "fmin z15.s, p4/M, z15.s, z17.s\n"
+      "fmax z8.s, p4/M, z8.s, z16.s\n"
+      "fmax z9.s, p4/M, z9.s, z16.s\n"
+      "fmax z10.s, p4/M, z10.s, z16.s\n"
+      "fmax z11.s, p4/M, z11.s, z16.s\n"
+      "fmax z12.s, p4/M, z12.s, z16.s\n"
+      "fmax z13.s, p4/M, z13.s, z16.s\n"
+      "fmax z14.s, p4/M, z14.s, z16.s\n"
+      "fmax z15.s, p4/M, z15.s, z16.s\n"
       "23:"  // Height 2: No activation
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -385,20 +385,20 @@
       "27:"  // Height 3: no bias
       "tbz %x[flags], #0, 28f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x21]\n"
+      "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x20]\n"
+      "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 29f\n"
       "28:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -418,13 +418,13 @@
       "30:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 32f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -433,8 +433,8 @@
       "b 32f\n"
       "31:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "32:"  // Height 3: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -450,21 +450,21 @@
       "subs x27, x27, #0x1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x4\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x24, x24, #0x4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z21.s, z0.s\n"
+      "fmla z14.s, p4/M, z21.s, z1.s\n"
+      "fmla z18.s, p4/M, z21.s, z2.s\n"
+      "fmla z11.s, p4/M, z20.s, z0.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1w { z6.s }, p4/Z, [x10]\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
+      "fmla z15.s, p4/M, z20.s, z1.s\n"
+      "fmla z19.s, p4/M, z20.s, z2.s\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -476,51 +476,51 @@
       "add x28, x28, #0x1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
+      "fmla z10.s, p4/M, z21.s, z0.s\n"
+      "fmla z14.s, p4/M, z21.s, z1.s\n"
+      "fmla z18.s, p4/M, z21.s, z2.s\n"
+      "fmla z11.s, p4/M, z20.s, z0.s\n"
+      "fmla z15.s, p4/M, z20.s, z1.s\n"
+      "fmla z19.s, p4/M, z20.s, z2.s\n"
       "bne 30b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #2\n"
       "add x24, x25, x20, LSL #2\n"
       "tbz %x[flags], #1, 35f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z21.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmin z16.s, p4/M, z16.s, z1.s\n"
-      "fmin z17.s, p4/M, z17.s, z1.s\n"
-      "fmin z18.s, p4/M, z18.s, z1.s\n"
-      "fmin z19.s, p4/M, z19.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
-      "fmax z16.s, p4/M, z16.s, z0.s\n"
-      "fmax z17.s, p4/M, z17.s, z0.s\n"
-      "fmax z18.s, p4/M, z18.s, z0.s\n"
-      "fmax z19.s, p4/M, z19.s, z0.s\n"
+      "ld1rw { z20.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z21.s\n"
+      "fmin z9.s, p4/M, z9.s, z21.s\n"
+      "fmin z10.s, p4/M, z10.s, z21.s\n"
+      "fmin z11.s, p4/M, z11.s, z21.s\n"
+      "fmin z12.s, p4/M, z12.s, z21.s\n"
+      "fmin z13.s, p4/M, z13.s, z21.s\n"
+      "fmin z14.s, p4/M, z14.s, z21.s\n"
+      "fmin z15.s, p4/M, z15.s, z21.s\n"
+      "fmin z16.s, p4/M, z16.s, z21.s\n"
+      "fmin z17.s, p4/M, z17.s, z21.s\n"
+      "fmin z18.s, p4/M, z18.s, z21.s\n"
+      "fmin z19.s, p4/M, z19.s, z21.s\n"
+      "fmax z8.s, p4/M, z8.s, z20.s\n"
+      "fmax z9.s, p4/M, z9.s, z20.s\n"
+      "fmax z10.s, p4/M, z10.s, z20.s\n"
+      "fmax z11.s, p4/M, z11.s, z20.s\n"
+      "fmax z12.s, p4/M, z12.s, z20.s\n"
+      "fmax z13.s, p4/M, z13.s, z20.s\n"
+      "fmax z14.s, p4/M, z14.s, z20.s\n"
+      "fmax z15.s, p4/M, z15.s, z20.s\n"
+      "fmax z16.s, p4/M, z16.s, z20.s\n"
+      "fmax z17.s, p4/M, z17.s, z20.s\n"
+      "fmax z18.s, p4/M, z18.s, z20.s\n"
+      "fmax z19.s, p4/M, z19.s, z20.s\n"
       "35:"  // Height 3: No activation
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -576,25 +576,25 @@
       "39:"  // Height 4: no bias
       "tbz %x[flags], #0, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x23]\n"
-      "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x22]\n"
+      "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x21]\n"
+      "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x20]\n"
+      "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 41f\n"
       "40:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -618,14 +618,14 @@
       "42:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 43f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 44f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -635,9 +635,9 @@
       "b 44f\n"
       "43:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "44:"  // Height 4: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -654,7 +654,7 @@
       "subs x27, x27, #0x1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z20.s, p4/M, z6.s, z3.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x4\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
@@ -662,19 +662,19 @@
       "add x23, x23, #0x4\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
+      "fmla z10.s, p4/M, z25.s, z0.s\n"
+      "fmla z14.s, p4/M, z25.s, z1.s\n"
+      "fmla z18.s, p4/M, z25.s, z2.s\n"
+      "fmla z22.s, p4/M, z25.s, z3.s\n"
       "ld1w { z6.s }, p4/Z, [x10]\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
+      "fmla z11.s, p4/M, z24.s, z0.s\n"
+      "fmla z15.s, p4/M, z24.s, z1.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
+      "fmla z19.s, p4/M, z24.s, z2.s\n"
+      "fmla z23.s, p4/M, z24.s, z3.s\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -686,22 +686,22 @@
       "add x28, x28, #0x1\n"
       "fmla z16.s, p4/M, z6.s, z2.s\n"
       "fmla z20.s, p4/M, z6.s, z3.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
+      "fmla z10.s, p4/M, z25.s, z0.s\n"
+      "fmla z14.s, p4/M, z25.s, z1.s\n"
+      "fmla z18.s, p4/M, z25.s, z2.s\n"
+      "fmla z22.s, p4/M, z25.s, z3.s\n"
+      "fmla z11.s, p4/M, z24.s, z0.s\n"
+      "fmla z15.s, p4/M, z24.s, z1.s\n"
+      "fmla z19.s, p4/M, z24.s, z2.s\n"
+      "fmla z23.s, p4/M, z24.s, z3.s\n"
       "bne 42b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #2\n"
@@ -709,41 +709,41 @@
       "add x23, x24, x20, LSL #2\n"
       "tbz %x[flags], #1, 47f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z25.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmin z16.s, p4/M, z16.s, z1.s\n"
-      "fmin z17.s, p4/M, z17.s, z1.s\n"
-      "fmin z18.s, p4/M, z18.s, z1.s\n"
-      "fmin z19.s, p4/M, z19.s, z1.s\n"
-      "fmin z20.s, p4/M, z20.s, z1.s\n"
-      "fmin z21.s, p4/M, z21.s, z1.s\n"
-      "fmin z22.s, p4/M, z22.s, z1.s\n"
-      "fmin z23.s, p4/M, z23.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
-      "fmax z16.s, p4/M, z16.s, z0.s\n"
-      "fmax z17.s, p4/M, z17.s, z0.s\n"
-      "fmax z18.s, p4/M, z18.s, z0.s\n"
-      "fmax z19.s, p4/M, z19.s, z0.s\n"
-      "fmax z20.s, p4/M, z20.s, z0.s\n"
-      "fmax z21.s, p4/M, z21.s, z0.s\n"
-      "fmax z22.s, p4/M, z22.s, z0.s\n"
-      "fmax z23.s, p4/M, z23.s, z0.s\n"
+      "ld1rw { z24.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z25.s\n"
+      "fmin z9.s, p4/M, z9.s, z25.s\n"
+      "fmin z10.s, p4/M, z10.s, z25.s\n"
+      "fmin z11.s, p4/M, z11.s, z25.s\n"
+      "fmin z12.s, p4/M, z12.s, z25.s\n"
+      "fmin z13.s, p4/M, z13.s, z25.s\n"
+      "fmin z14.s, p4/M, z14.s, z25.s\n"
+      "fmin z15.s, p4/M, z15.s, z25.s\n"
+      "fmin z16.s, p4/M, z16.s, z25.s\n"
+      "fmin z17.s, p4/M, z17.s, z25.s\n"
+      "fmin z18.s, p4/M, z18.s, z25.s\n"
+      "fmin z19.s, p4/M, z19.s, z25.s\n"
+      "fmin z20.s, p4/M, z20.s, z25.s\n"
+      "fmin z21.s, p4/M, z21.s, z25.s\n"
+      "fmin z22.s, p4/M, z22.s, z25.s\n"
+      "fmin z23.s, p4/M, z23.s, z25.s\n"
+      "fmax z8.s, p4/M, z8.s, z24.s\n"
+      "fmax z9.s, p4/M, z9.s, z24.s\n"
+      "fmax z10.s, p4/M, z10.s, z24.s\n"
+      "fmax z11.s, p4/M, z11.s, z24.s\n"
+      "fmax z12.s, p4/M, z12.s, z24.s\n"
+      "fmax z13.s, p4/M, z13.s, z24.s\n"
+      "fmax z14.s, p4/M, z14.s, z24.s\n"
+      "fmax z15.s, p4/M, z15.s, z24.s\n"
+      "fmax z16.s, p4/M, z16.s, z24.s\n"
+      "fmax z17.s, p4/M, z17.s, z24.s\n"
+      "fmax z18.s, p4/M, z18.s, z24.s\n"
+      "fmax z19.s, p4/M, z19.s, z24.s\n"
+      "fmax z20.s, p4/M, z20.s, z24.s\n"
+      "fmax z21.s, p4/M, z21.s, z24.s\n"
+      "fmax z22.s, p4/M, z22.s, z24.s\n"
+      "fmax z23.s, p4/M, z23.s, z24.s\n"
       "47:"  // Height 4: No activation
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -807,30 +807,30 @@
       "51:"  // Height 5: no bias
       "tbz %x[flags], #0, 52f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p3/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p3/Z, [x9]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x23]\n"
-      "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p3/Z, [x22]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x23]\n"
+      "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x22]\n"
+      "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x21]\n"
+      "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p3/Z, [x20]\n"
+      "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 53f\n"
       "52:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -858,15 +858,15 @@
       "54:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 55f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -877,10 +877,10 @@
       "b 56f\n"
       "55:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "56:"  // Height 5: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -902,29 +902,29 @@
       "add x24, x24, #0x4\n"
       "fmla z24.s, p4/M, z6.s, z4.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
       "add x23, x23, #0x4\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "add x22, x22, #0x4\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
       "fmla z25.s, p4/M, z7.s, z4.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
-      "fmla z26.s, p4/M, z6.s, z4.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
+      "fmla z10.s, p4/M, z29.s, z0.s\n"
+      "fmla z14.s, p4/M, z29.s, z1.s\n"
+      "fmla z18.s, p4/M, z29.s, z2.s\n"
+      "fmla z22.s, p4/M, z29.s, z3.s\n"
+      "fmla z26.s, p4/M, z29.s, z4.s\n"
+      "fmla z11.s, p4/M, z28.s, z0.s\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1w { z6.s }, p4/Z, [x10]\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
+      "fmla z15.s, p4/M, z28.s, z1.s\n"
+      "fmla z19.s, p4/M, z28.s, z2.s\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
-      "fmla z27.s, p4/M, z7.s, z4.s\n"
+      "fmla z23.s, p4/M, z28.s, z3.s\n"
+      "fmla z27.s, p4/M, z28.s, z4.s\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1rw { z4.s }, p4/Z, [x22]\n"
       "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n"
@@ -939,23 +939,23 @@
       "cmp x28, x20\n"
       "fmla z24.s, p4/M, z6.s, z4.s\n"
       "fmla z9.s, p4/M, z7.s, z0.s\n"
-      "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n"
       "fmla z13.s, p4/M, z7.s, z1.s\n"
       "fmla z17.s, p4/M, z7.s, z2.s\n"
       "fmla z21.s, p4/M, z7.s, z3.s\n"
       "fmla z25.s, p4/M, z7.s, z4.s\n"
-      "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, p4/M, z6.s, z0.s\n"
-      "fmla z14.s, p4/M, z6.s, z1.s\n"
-      "fmla z18.s, p4/M, z6.s, z2.s\n"
-      "fmla z22.s, p4/M, z6.s, z3.s\n"
-      "fmla z26.s, p4/M, z6.s, z4.s\n"
-      "fmla z11.s, p4/M, z7.s, z0.s\n"
-      "fmla z15.s, p4/M, z7.s, z1.s\n"
-      "fmla z19.s, p4/M, z7.s, z2.s\n"
-      "fmla z23.s, p4/M, z7.s, z3.s\n"
-      "fmla z27.s, p4/M, z7.s, z4.s\n"
+      "fmla z10.s, p4/M, z29.s, z0.s\n"
+      "fmla z14.s, p4/M, z29.s, z1.s\n"
+      "fmla z18.s, p4/M, z29.s, z2.s\n"
+      "fmla z22.s, p4/M, z29.s, z3.s\n"
+      "fmla z26.s, p4/M, z29.s, z4.s\n"
+      "fmla z11.s, p4/M, z28.s, z0.s\n"
+      "fmla z15.s, p4/M, z28.s, z1.s\n"
+      "fmla z19.s, p4/M, z28.s, z2.s\n"
+      "fmla z23.s, p4/M, z28.s, z3.s\n"
+      "fmla z27.s, p4/M, z28.s, z4.s\n"
       "bne 54b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x25, x9, x20, LSL #2\n"
@@ -964,49 +964,49 @@
       "add x22, x23, x20, LSL #2\n"
       "tbz %x[flags], #1, 59f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p4/Z, [x20]\n"
+      "ld1rw { z29.s }, p4/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p4/Z, [x20]\n"
-      "fmin z8.s, p4/M, z8.s, z1.s\n"
-      "fmin z9.s, p4/M, z9.s, z1.s\n"
-      "fmin z10.s, p4/M, z10.s, z1.s\n"
-      "fmin z11.s, p4/M, z11.s, z1.s\n"
-      "fmin z12.s, p4/M, z12.s, z1.s\n"
-      "fmin z13.s, p4/M, z13.s, z1.s\n"
-      "fmin z14.s, p4/M, z14.s, z1.s\n"
-      "fmin z15.s, p4/M, z15.s, z1.s\n"
-      "fmin z16.s, p4/M, z16.s, z1.s\n"
-      "fmin z17.s, p4/M, z17.s, z1.s\n"
-      "fmin z18.s, p4/M, z18.s, z1.s\n"
-      "fmin z19.s, p4/M, z19.s, z1.s\n"
-      "fmin z20.s, p4/M, z20.s, z1.s\n"
-      "fmin z21.s, p4/M, z21.s, z1.s\n"
-      "fmin z22.s, p4/M, z22.s, z1.s\n"
-      "fmin z23.s, p4/M, z23.s, z1.s\n"
-      "fmin z24.s, p4/M, z24.s, z1.s\n"
-      "fmin z25.s, p4/M, z25.s, z1.s\n"
-      "fmin z26.s, p4/M, z26.s, z1.s\n"
-      "fmin z27.s, p4/M, z27.s, z1.s\n"
-      "fmax z8.s, p4/M, z8.s, z0.s\n"
-      "fmax z9.s, p4/M, z9.s, z0.s\n"
-      "fmax z10.s, p4/M, z10.s, z0.s\n"
-      "fmax z11.s, p4/M, z11.s, z0.s\n"
-      "fmax z12.s, p4/M, z12.s, z0.s\n"
-      "fmax z13.s, p4/M, z13.s, z0.s\n"
-      "fmax z14.s, p4/M, z14.s, z0.s\n"
-      "fmax z15.s, p4/M, z15.s, z0.s\n"
-      "fmax z16.s, p4/M, z16.s, z0.s\n"
-      "fmax z17.s, p4/M, z17.s, z0.s\n"
-      "fmax z18.s, p4/M, z18.s, z0.s\n"
-      "fmax z19.s, p4/M, z19.s, z0.s\n"
-      "fmax z20.s, p4/M, z20.s, z0.s\n"
-      "fmax z21.s, p4/M, z21.s, z0.s\n"
-      "fmax z22.s, p4/M, z22.s, z0.s\n"
-      "fmax z23.s, p4/M, z23.s, z0.s\n"
-      "fmax z24.s, p4/M, z24.s, z0.s\n"
-      "fmax z25.s, p4/M, z25.s, z0.s\n"
-      "fmax z26.s, p4/M, z26.s, z0.s\n"
-      "fmax z27.s, p4/M, z27.s, z0.s\n"
+      "ld1rw { z28.s }, p4/Z, [x20]\n"
+      "fmin z8.s, p4/M, z8.s, z29.s\n"
+      "fmin z9.s, p4/M, z9.s, z29.s\n"
+      "fmin z10.s, p4/M, z10.s, z29.s\n"
+      "fmin z11.s, p4/M, z11.s, z29.s\n"
+      "fmin z12.s, p4/M, z12.s, z29.s\n"
+      "fmin z13.s, p4/M, z13.s, z29.s\n"
+      "fmin z14.s, p4/M, z14.s, z29.s\n"
+      "fmin z15.s, p4/M, z15.s, z29.s\n"
+      "fmin z16.s, p4/M, z16.s, z29.s\n"
+      "fmin z17.s, p4/M, z17.s, z29.s\n"
+      "fmin z18.s, p4/M, z18.s, z29.s\n"
+      "fmin z19.s, p4/M, z19.s, z29.s\n"
+      "fmin z20.s, p4/M, z20.s, z29.s\n"
+      "fmin z21.s, p4/M, z21.s, z29.s\n"
+      "fmin z22.s, p4/M, z22.s, z29.s\n"
+      "fmin z23.s, p4/M, z23.s, z29.s\n"
+      "fmin z24.s, p4/M, z24.s, z29.s\n"
+      "fmin z25.s, p4/M, z25.s, z29.s\n"
+      "fmin z26.s, p4/M, z26.s, z29.s\n"
+      "fmin z27.s, p4/M, z27.s, z29.s\n"
+      "fmax z8.s, p4/M, z8.s, z28.s\n"
+      "fmax z9.s, p4/M, z9.s, z28.s\n"
+      "fmax z10.s, p4/M, z10.s, z28.s\n"
+      "fmax z11.s, p4/M, z11.s, z28.s\n"
+      "fmax z12.s, p4/M, z12.s, z28.s\n"
+      "fmax z13.s, p4/M, z13.s, z28.s\n"
+      "fmax z14.s, p4/M, z14.s, z28.s\n"
+      "fmax z15.s, p4/M, z15.s, z28.s\n"
+      "fmax z16.s, p4/M, z16.s, z28.s\n"
+      "fmax z17.s, p4/M, z17.s, z28.s\n"
+      "fmax z18.s, p4/M, z18.s, z28.s\n"
+      "fmax z19.s, p4/M, z19.s, z28.s\n"
+      "fmax z20.s, p4/M, z20.s, z28.s\n"
+      "fmax z21.s, p4/M, z21.s, z28.s\n"
+      "fmax z22.s, p4/M, z22.s, z28.s\n"
+      "fmax z23.s, p4/M, z23.s, z28.s\n"
+      "fmax z24.s, p4/M, z24.s, z28.s\n"
+      "fmax z25.s, p4/M, z25.s, z28.s\n"
+      "fmax z26.s, p4/M, z26.s, z28.s\n"
+      "fmax z27.s, p4/M, z27.s, z28.s\n"
       "59:"  // Height 5: No activation
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
@@ -1081,35 +1081,35 @@
       "63:"  // Height 6: no bias
       "tbz %x[flags], #0, 64f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p3/Z, [x9]\n"
+      "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z8.s }, p3/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x25]\n"
-      "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x24]\n"
-      "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x23]\n"
-      "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p3/Z, [x22]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p3/Z, [x21]\n"
-      "ld1w { z29.s }, p2/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p1/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z31.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x24]\n"
+      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x23]\n"
+      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x22]\n"
+      "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z24.s }, p3/Z, [x21]\n"
+      "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p3/Z, [x20]\n"
+      "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 65f\n"
       "64:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1141,16 +1141,16 @@
       "66:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 67f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 68f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1162,11 +1162,11 @@
       "b 68f\n"
       "67:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "68:"  // Height 6: input setup done
       "subs x27, x27, #0x1\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1355,7 +1355,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "74:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1363,4 +1362,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
index 71c6afb..e1581f2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -139,11 +139,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -156,87 +156,87 @@
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
       "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10]\n"
+      "fmla z8.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z10.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z11.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z8.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z9.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z10.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z11.s, z16.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[2]\n"
+      "fmla z9.s, z16.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.s, z17.s, z0.s[2]\n"
+      "fmla z11.s, z16.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[3]\n"
+      "fmla z9.s, z16.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z10.s, z17.s, z0.s[3]\n"
+      "fmla z11.s, z16.s, z0.s[3]\n"
       "add x26, x26, #0x10\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
       "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10]\n"
+      "fmla z8.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z9.s, z16.s, z0.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z17.s, z0.s[0]\n"
+      "fmla z11.s, z16.s, z0.s[0]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[1]\n"
+      "fmla z9.s, z16.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z10.s, z17.s, z0.s[1]\n"
+      "fmla z11.s, z16.s, z0.s[1]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[2]\n"
+      "fmla z9.s, z16.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z10.s, z17.s, z0.s[2]\n"
+      "fmla z11.s, z16.s, z0.s[2]\n"
       "addvl x10, x10, #4\n"
       "ble 11f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[3]\n"
+      "fmla z9.s, z16.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z17.s, z0.s[3]\n"
+      "fmla z11.s, z16.s, z0.s[3]\n"
       "addvl x10, x10, #4\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -245,17 +245,17 @@
       "bne 6b\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "12:"  // Height 1: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -295,15 +295,15 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 18f\n"
       "17:"  // Height 2: no accumulate
       "mov z8.b, #0x0\n"
@@ -319,12 +319,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -332,130 +332,130 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "21:"  // Height 2: input setup done
       "cmp x27, #0x4\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z1.s }, p0/Z, [x26]\n"
+      "ld1rqw { z0.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[0]\n"
+      "fmla z12.s, z17.s, z0.s[0]\n"
+      "fmla z9.s, z16.s, z1.s[0]\n"
+      "fmla z13.s, z16.s, z0.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z17.s, z1.s[0]\n"
+      "fmla z14.s, z17.s, z0.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #4, MUL VL]\n"
       "cmp x27, #0x4\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z11.s, z16.s, z1.s[0]\n"
+      "fmla z15.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[1]\n"
+      "fmla z12.s, z17.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #6, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z9.s, z16.s, z1.s[1]\n"
+      "fmla z13.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z10.s, z17.s, z1.s[1]\n"
+      "fmla z14.s, z17.s, z0.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.s, z16.s, z1.s[1]\n"
+      "fmla z15.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[2]\n"
+      "fmla z12.s, z17.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.s, z16.s, z1.s[2]\n"
+      "fmla z13.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.s, z17.s, z1.s[2]\n"
+      "fmla z14.s, z17.s, z0.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.s, z16.s, z1.s[2]\n"
+      "fmla z15.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.s, z17.s, z1.s[3]\n"
+      "fmla z12.s, z17.s, z0.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.s, z16.s, z1.s[3]\n"
+      "fmla z13.s, z16.s, z0.s[3]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.s, z17.s, z1.s[3]\n"
+      "fmla z14.s, z17.s, z0.s[3]\n"
+      "fmla z11.s, z16.s, z1.s[3]\n"
+      "fmla z15.s, z16.s, z0.s[3]\n"
       "bgt 22b\n"
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
       "ld1rqw { z0.s }, p0/Z, [x26]\n"
       "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[0]\n"
+      "fmla z12.s, z17.s, z1.s[0]\n"
+      "fmla z9.s, z16.s, z0.s[0]\n"
+      "fmla z13.s, z16.s, z1.s[0]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z17.s, z0.s[0]\n"
+      "fmla z14.s, z17.s, z1.s[0]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z11.s, z16.s, z0.s[0]\n"
+      "fmla z15.s, z16.s, z1.s[0]\n"
       "ble 24f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[1]\n"
+      "fmla z12.s, z17.s, z1.s[1]\n"
+      "fmla z9.s, z16.s, z0.s[1]\n"
+      "fmla z13.s, z16.s, z1.s[1]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z10.s, z17.s, z0.s[1]\n"
+      "fmla z14.s, z17.s, z1.s[1]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z11.s, z16.s, z0.s[1]\n"
+      "fmla z15.s, z16.s, z1.s[1]\n"
       "ble 24f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[2]\n"
+      "fmla z12.s, z17.s, z1.s[2]\n"
+      "fmla z9.s, z16.s, z0.s[2]\n"
+      "fmla z13.s, z16.s, z1.s[2]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z10.s, z17.s, z0.s[2]\n"
+      "fmla z14.s, z17.s, z1.s[2]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z11.s, z16.s, z0.s[2]\n"
+      "fmla z15.s, z16.s, z1.s[2]\n"
       "ble 24f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z17.s, z0.s[3]\n"
+      "fmla z12.s, z17.s, z1.s[3]\n"
+      "fmla z9.s, z16.s, z0.s[3]\n"
+      "fmla z13.s, z16.s, z1.s[3]\n"
+      "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z17.s, z0.s[3]\n"
+      "fmla z14.s, z17.s, z1.s[3]\n"
       "addvl x10, x10, #4\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z11.s, z16.s, z0.s[3]\n"
+      "fmla z15.s, z16.s, z1.s[3]\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -465,25 +465,25 @@
       "add x25, x9, x20, LSL #2\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmin z12.s, p5/M, z12.s, z17.s\n"
+      "fmin z13.s, p5/M, z13.s, z17.s\n"
+      "fmin z14.s, p5/M, z14.s, z17.s\n"
+      "fmin z15.s, p5/M, z15.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
+      "fmax z12.s, p5/M, z12.s, z16.s\n"
+      "fmax z13.s, p5/M, z13.s, z16.s\n"
+      "fmax z14.s, p5/M, z14.s, z16.s\n"
+      "fmax z15.s, p5/M, z15.s, z16.s\n"
       "25:"  // Height 2: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -531,20 +531,20 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20]\n"
+      "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 31f\n"
       "30:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -564,13 +564,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -579,86 +579,86 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "34:"  // Height 3: input setup done
       "cmp x27, #0x4\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
       "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1rqw { z0.s }, p0/Z, [x24]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "fmla z8.s, z21.s, z2.s[0]\n"
+      "fmla z12.s, z21.s, z1.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.s, z21.s, z0.s[0]\n"
+      "fmla z9.s, z20.s, z2.s[0]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[0]\n"
+      "fmla z17.s, z20.s, z0.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
       "cmp x27, #0x4\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z10.s, z21.s, z2.s[0]\n"
+      "fmla z14.s, z21.s, z1.s[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z18.s, z21.s, z0.s[0]\n"
+      "fmla z11.s, z20.s, z2.s[0]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z15.s, z20.s, z1.s[0]\n"
+      "fmla z19.s, z20.s, z0.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.s, z21.s, z2.s[1]\n"
+      "fmla z12.s, z21.s, z1.s[1]\n"
+      "fmla z16.s, z21.s, z0.s[1]\n"
+      "fmla z9.s, z20.s, z2.s[1]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[1]\n"
+      "fmla z17.s, z20.s, z0.s[1]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z10.s, z21.s, z2.s[1]\n"
+      "fmla z14.s, z21.s, z1.s[1]\n"
+      "fmla z18.s, z21.s, z0.s[1]\n"
+      "fmla z11.s, z20.s, z2.s[1]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z15.s, z20.s, z1.s[1]\n"
+      "fmla z19.s, z20.s, z0.s[1]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.s, z21.s, z2.s[2]\n"
+      "fmla z12.s, z21.s, z1.s[2]\n"
+      "fmla z16.s, z21.s, z0.s[2]\n"
+      "fmla z9.s, z20.s, z2.s[2]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[2]\n"
+      "fmla z17.s, z20.s, z0.s[2]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.s, z21.s, z2.s[2]\n"
+      "fmla z14.s, z21.s, z1.s[2]\n"
+      "fmla z18.s, z21.s, z0.s[2]\n"
+      "fmla z11.s, z20.s, z2.s[2]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z15.s, z20.s, z1.s[2]\n"
+      "fmla z19.s, z20.s, z0.s[2]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.s, z21.s, z2.s[3]\n"
+      "fmla z12.s, z21.s, z1.s[3]\n"
+      "fmla z16.s, z21.s, z0.s[3]\n"
+      "fmla z9.s, z20.s, z2.s[3]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[3]\n"
+      "fmla z17.s, z20.s, z0.s[3]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.s, z21.s, z2.s[3]\n"
+      "fmla z14.s, z21.s, z1.s[3]\n"
+      "fmla z18.s, z21.s, z0.s[3]\n"
+      "fmla z11.s, z20.s, z2.s[3]\n"
+      "fmla z15.s, z20.s, z1.s[3]\n"
+      "fmla z19.s, z20.s, z0.s[3]\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -666,79 +666,79 @@
       "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "subs x27, x27, #0x1\n"
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "fmla z8.s, z21.s, z0.s[0]\n"
+      "fmla z12.s, z21.s, z1.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.s, z21.s, z2.s[0]\n"
+      "fmla z9.s, z20.s, z0.s[0]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[0]\n"
+      "fmla z17.s, z20.s, z2.s[0]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z10.s, z21.s, z0.s[0]\n"
+      "fmla z14.s, z21.s, z1.s[0]\n"
+      "fmla z18.s, z21.s, z2.s[0]\n"
+      "fmla z11.s, z20.s, z0.s[0]\n"
+      "fmla z15.s, z20.s, z1.s[0]\n"
+      "fmla z19.s, z20.s, z2.s[0]\n"
       "ble 37f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z21.s, z0.s[1]\n"
+      "fmla z12.s, z21.s, z1.s[1]\n"
+      "fmla z16.s, z21.s, z2.s[1]\n"
+      "fmla z9.s, z20.s, z0.s[1]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[1]\n"
+      "fmla z17.s, z20.s, z2.s[1]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z10.s, z21.s, z0.s[1]\n"
+      "fmla z14.s, z21.s, z1.s[1]\n"
+      "fmla z18.s, z21.s, z2.s[1]\n"
+      "fmla z11.s, z20.s, z0.s[1]\n"
+      "fmla z15.s, z20.s, z1.s[1]\n"
+      "fmla z19.s, z20.s, z2.s[1]\n"
       "ble 37f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z21.s, z0.s[2]\n"
+      "fmla z12.s, z21.s, z1.s[2]\n"
+      "fmla z16.s, z21.s, z2.s[2]\n"
+      "fmla z9.s, z20.s, z0.s[2]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[2]\n"
+      "fmla z17.s, z20.s, z2.s[2]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z10.s, z21.s, z0.s[2]\n"
+      "fmla z14.s, z21.s, z1.s[2]\n"
+      "fmla z18.s, z21.s, z2.s[2]\n"
+      "fmla z11.s, z20.s, z0.s[2]\n"
+      "fmla z15.s, z20.s, z1.s[2]\n"
+      "fmla z19.s, z20.s, z2.s[2]\n"
       "ble 37f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z21.s }, p5/Z, [x10]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z21.s, z0.s[3]\n"
+      "fmla z12.s, z21.s, z1.s[3]\n"
+      "fmla z16.s, z21.s, z2.s[3]\n"
+      "fmla z9.s, z20.s, z0.s[3]\n"
+      "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z20.s, z1.s[3]\n"
+      "fmla z17.s, z20.s, z2.s[3]\n"
+      "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z10.s, z21.s, z0.s[3]\n"
+      "fmla z14.s, z21.s, z1.s[3]\n"
+      "fmla z18.s, z21.s, z2.s[3]\n"
+      "fmla z11.s, z20.s, z0.s[3]\n"
+      "fmla z15.s, z20.s, z1.s[3]\n"
+      "fmla z19.s, z20.s, z2.s[3]\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -749,33 +749,33 @@
       "add x24, x25, x20, LSL #2\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z21.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z20.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z21.s\n"
+      "fmin z9.s, p5/M, z9.s, z21.s\n"
+      "fmin z10.s, p5/M, z10.s, z21.s\n"
+      "fmin z11.s, p5/M, z11.s, z21.s\n"
+      "fmin z12.s, p5/M, z12.s, z21.s\n"
+      "fmin z13.s, p5/M, z13.s, z21.s\n"
+      "fmin z14.s, p5/M, z14.s, z21.s\n"
+      "fmin z15.s, p5/M, z15.s, z21.s\n"
+      "fmin z16.s, p5/M, z16.s, z21.s\n"
+      "fmin z17.s, p5/M, z17.s, z21.s\n"
+      "fmin z18.s, p5/M, z18.s, z21.s\n"
+      "fmin z19.s, p5/M, z19.s, z21.s\n"
+      "fmax z8.s, p5/M, z8.s, z20.s\n"
+      "fmax z9.s, p5/M, z9.s, z20.s\n"
+      "fmax z10.s, p5/M, z10.s, z20.s\n"
+      "fmax z11.s, p5/M, z11.s, z20.s\n"
+      "fmax z12.s, p5/M, z12.s, z20.s\n"
+      "fmax z13.s, p5/M, z13.s, z20.s\n"
+      "fmax z14.s, p5/M, z14.s, z20.s\n"
+      "fmax z15.s, p5/M, z15.s, z20.s\n"
+      "fmax z16.s, p5/M, z16.s, z20.s\n"
+      "fmax z17.s, p5/M, z17.s, z20.s\n"
+      "fmax z18.s, p5/M, z18.s, z20.s\n"
+      "fmax z19.s, p5/M, z19.s, z20.s\n"
       "38:"  // Height 3: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -831,25 +831,25 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21]\n"
+      "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 44f\n"
       "43:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -873,14 +873,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -890,105 +890,105 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "47:"  // Height 4: input setup done
       "cmp x27, #0x4\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z3.s }, p0/Z, [x26]\n"
+      "ld1rqw { z2.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
+      "ld1rqw { z1.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[0]\n"
+      "fmla z12.s, z25.s, z2.s[0]\n"
+      "fmla z16.s, z25.s, z1.s[0]\n"
+      "fmla z20.s, z25.s, z0.s[0]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
+      "fmla z9.s, z24.s, z3.s[0]\n"
+      "fmla z13.s, z24.s, z2.s[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z17.s, z24.s, z1.s[0]\n"
+      "fmla z21.s, z24.s, z0.s[0]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z25.s, z3.s[0]\n"
+      "fmla z14.s, z25.s, z2.s[0]\n"
+      "fmla z18.s, z25.s, z1.s[0]\n"
+      "fmla z22.s, z25.s, z0.s[0]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.s, z24.s, z3.s[0]\n"
+      "fmla z15.s, z24.s, z2.s[0]\n"
+      "fmla z19.s, z24.s, z1.s[0]\n"
+      "fmla z23.s, z24.s, z0.s[0]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[1]\n"
+      "fmla z12.s, z25.s, z2.s[1]\n"
+      "fmla z16.s, z25.s, z1.s[1]\n"
+      "fmla z20.s, z25.s, z0.s[1]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.s, z24.s, z3.s[1]\n"
+      "fmla z13.s, z24.s, z2.s[1]\n"
+      "fmla z17.s, z24.s, z1.s[1]\n"
+      "fmla z21.s, z24.s, z0.s[1]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z10.s, z25.s, z3.s[1]\n"
+      "fmla z14.s, z25.s, z2.s[1]\n"
+      "fmla z18.s, z25.s, z1.s[1]\n"
+      "fmla z22.s, z25.s, z0.s[1]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.s, z24.s, z3.s[1]\n"
+      "fmla z15.s, z24.s, z2.s[1]\n"
+      "fmla z19.s, z24.s, z1.s[1]\n"
+      "fmla z23.s, z24.s, z0.s[1]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[2]\n"
+      "fmla z12.s, z25.s, z2.s[2]\n"
+      "fmla z16.s, z25.s, z1.s[2]\n"
+      "fmla z20.s, z25.s, z0.s[2]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.s, z24.s, z3.s[2]\n"
+      "fmla z13.s, z24.s, z2.s[2]\n"
+      "fmla z17.s, z24.s, z1.s[2]\n"
+      "fmla z21.s, z24.s, z0.s[2]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.s, z25.s, z3.s[2]\n"
+      "fmla z14.s, z25.s, z2.s[2]\n"
+      "fmla z18.s, z25.s, z1.s[2]\n"
+      "fmla z22.s, z25.s, z0.s[2]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.s, z24.s, z3.s[2]\n"
+      "fmla z15.s, z24.s, z2.s[2]\n"
+      "fmla z19.s, z24.s, z1.s[2]\n"
+      "fmla z23.s, z24.s, z0.s[2]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.s, z25.s, z3.s[3]\n"
+      "fmla z12.s, z25.s, z2.s[3]\n"
+      "fmla z16.s, z25.s, z1.s[3]\n"
+      "fmla z20.s, z25.s, z0.s[3]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.s, z24.s, z3.s[3]\n"
+      "fmla z13.s, z24.s, z2.s[3]\n"
+      "fmla z17.s, z24.s, z1.s[3]\n"
+      "fmla z21.s, z24.s, z0.s[3]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.s, z25.s, z3.s[3]\n"
+      "fmla z14.s, z25.s, z2.s[3]\n"
+      "fmla z18.s, z25.s, z1.s[3]\n"
+      "fmla z22.s, z25.s, z0.s[3]\n"
+      "fmla z11.s, z24.s, z3.s[3]\n"
+      "fmla z15.s, z24.s, z2.s[3]\n"
+      "fmla z19.s, z24.s, z1.s[3]\n"
+      "fmla z23.s, z24.s, z0.s[3]\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -997,95 +997,95 @@
       "subs x27, x27, #0x1\n"
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z25.s, z0.s[0]\n"
+      "fmla z12.s, z25.s, z1.s[0]\n"
+      "fmla z16.s, z25.s, z2.s[0]\n"
+      "fmla z20.s, z25.s, z3.s[0]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z24.s, z0.s[0]\n"
+      "fmla z13.s, z24.s, z1.s[0]\n"
+      "fmla z17.s, z24.s, z2.s[0]\n"
+      "fmla z21.s, z24.s, z3.s[0]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
+      "fmla z10.s, z25.s, z0.s[0]\n"
+      "fmla z14.s, z25.s, z1.s[0]\n"
+      "fmla z18.s, z25.s, z2.s[0]\n"
+      "fmla z22.s, z25.s, z3.s[0]\n"
+      "fmla z11.s, z24.s, z0.s[0]\n"
+      "fmla z15.s, z24.s, z1.s[0]\n"
+      "fmla z19.s, z24.s, z2.s[0]\n"
+      "fmla z23.s, z24.s, z3.s[0]\n"
       "ble 50f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z25.s, z0.s[1]\n"
+      "fmla z12.s, z25.s, z1.s[1]\n"
+      "fmla z16.s, z25.s, z2.s[1]\n"
+      "fmla z20.s, z25.s, z3.s[1]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.s, z24.s, z0.s[1]\n"
+      "fmla z13.s, z24.s, z1.s[1]\n"
+      "fmla z17.s, z24.s, z2.s[1]\n"
+      "fmla z21.s, z24.s, z3.s[1]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
+      "fmla z10.s, z25.s, z0.s[1]\n"
+      "fmla z14.s, z25.s, z1.s[1]\n"
+      "fmla z18.s, z25.s, z2.s[1]\n"
+      "fmla z22.s, z25.s, z3.s[1]\n"
+      "fmla z11.s, z24.s, z0.s[1]\n"
+      "fmla z15.s, z24.s, z1.s[1]\n"
+      "fmla z19.s, z24.s, z2.s[1]\n"
+      "fmla z23.s, z24.s, z3.s[1]\n"
       "ble 50f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z25.s, z0.s[2]\n"
+      "fmla z12.s, z25.s, z1.s[2]\n"
+      "fmla z16.s, z25.s, z2.s[2]\n"
+      "fmla z20.s, z25.s, z3.s[2]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z9.s, z24.s, z0.s[2]\n"
+      "fmla z13.s, z24.s, z1.s[2]\n"
+      "fmla z17.s, z24.s, z2.s[2]\n"
+      "fmla z21.s, z24.s, z3.s[2]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
+      "fmla z10.s, z25.s, z0.s[2]\n"
+      "fmla z14.s, z25.s, z1.s[2]\n"
+      "fmla z18.s, z25.s, z2.s[2]\n"
+      "fmla z22.s, z25.s, z3.s[2]\n"
+      "fmla z11.s, z24.s, z0.s[2]\n"
+      "fmla z15.s, z24.s, z1.s[2]\n"
+      "fmla z19.s, z24.s, z2.s[2]\n"
+      "fmla z23.s, z24.s, z3.s[2]\n"
       "ble 50f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z25.s }, p5/Z, [x10]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z25.s, z0.s[3]\n"
+      "fmla z12.s, z25.s, z1.s[3]\n"
+      "fmla z16.s, z25.s, z2.s[3]\n"
+      "fmla z20.s, z25.s, z3.s[3]\n"
+      "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z24.s, z0.s[3]\n"
+      "fmla z13.s, z24.s, z1.s[3]\n"
+      "fmla z17.s, z24.s, z2.s[3]\n"
+      "fmla z21.s, z24.s, z3.s[3]\n"
+      "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z10.s, z25.s, z0.s[3]\n"
+      "fmla z14.s, z25.s, z1.s[3]\n"
+      "fmla z18.s, z25.s, z2.s[3]\n"
+      "fmla z22.s, z25.s, z3.s[3]\n"
+      "fmla z11.s, z24.s, z0.s[3]\n"
+      "fmla z15.s, z24.s, z1.s[3]\n"
+      "fmla z19.s, z24.s, z2.s[3]\n"
+      "fmla z23.s, z24.s, z3.s[3]\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1097,41 +1097,41 @@
       "add x23, x24, x20, LSL #2\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z25.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z23.s, p5/M, z23.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z23.s, p5/M, z23.s, z0.s\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z25.s\n"
+      "fmin z9.s, p5/M, z9.s, z25.s\n"
+      "fmin z10.s, p5/M, z10.s, z25.s\n"
+      "fmin z11.s, p5/M, z11.s, z25.s\n"
+      "fmin z12.s, p5/M, z12.s, z25.s\n"
+      "fmin z13.s, p5/M, z13.s, z25.s\n"
+      "fmin z14.s, p5/M, z14.s, z25.s\n"
+      "fmin z15.s, p5/M, z15.s, z25.s\n"
+      "fmin z16.s, p5/M, z16.s, z25.s\n"
+      "fmin z17.s, p5/M, z17.s, z25.s\n"
+      "fmin z18.s, p5/M, z18.s, z25.s\n"
+      "fmin z19.s, p5/M, z19.s, z25.s\n"
+      "fmin z20.s, p5/M, z20.s, z25.s\n"
+      "fmin z21.s, p5/M, z21.s, z25.s\n"
+      "fmin z22.s, p5/M, z22.s, z25.s\n"
+      "fmin z23.s, p5/M, z23.s, z25.s\n"
+      "fmax z8.s, p5/M, z8.s, z24.s\n"
+      "fmax z9.s, p5/M, z9.s, z24.s\n"
+      "fmax z10.s, p5/M, z10.s, z24.s\n"
+      "fmax z11.s, p5/M, z11.s, z24.s\n"
+      "fmax z12.s, p5/M, z12.s, z24.s\n"
+      "fmax z13.s, p5/M, z13.s, z24.s\n"
+      "fmax z14.s, p5/M, z14.s, z24.s\n"
+      "fmax z15.s, p5/M, z15.s, z24.s\n"
+      "fmax z16.s, p5/M, z16.s, z24.s\n"
+      "fmax z17.s, p5/M, z17.s, z24.s\n"
+      "fmax z18.s, p5/M, z18.s, z24.s\n"
+      "fmax z19.s, p5/M, z19.s, z24.s\n"
+      "fmax z20.s, p5/M, z20.s, z24.s\n"
+      "fmax z21.s, p5/M, z21.s, z24.s\n"
+      "fmax z22.s, p5/M, z22.s, z24.s\n"
+      "fmax z23.s, p5/M, z23.s, z24.s\n"
       "51:"  // Height 4: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1195,30 +1195,30 @@
       "55:"  // Height 5: no bias
       "tbz %x[flags], #0, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x9]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x22]\n"
-      "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22]\n"
+      "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x20]\n"
+      "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 57f\n"
       "56:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1246,15 +1246,15 @@
       "58:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1265,124 +1265,124 @@
       "b 60f\n"
       "59:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "60:"  // Height 5: input setup done
       "cmp x27, #0x4\n"
       "ble 62f\n"
       "61:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "ld1rqw { z3.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
+      "ld1rqw { z1.s }, p0/Z, [x23]\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1rqw { z0.s }, p0/Z, [x22]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
+      "fmla z8.s, z29.s, z4.s[0]\n"
+      "fmla z12.s, z29.s, z3.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.s, z29.s, z2.s[0]\n"
+      "fmla z20.s, z29.s, z1.s[0]\n"
       "add x25, x25, #0x10\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z24.s, z29.s, z0.s[0]\n"
+      "fmla z9.s, z28.s, z4.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
+      "fmla z13.s, z28.s, z3.s[0]\n"
+      "fmla z17.s, z28.s, z2.s[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z21.s, z28.s, z1.s[0]\n"
+      "fmla z25.s, z28.s, z0.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z29.s, z4.s[0]\n"
+      "fmla z14.s, z29.s, z3.s[0]\n"
+      "fmla z18.s, z29.s, z2.s[0]\n"
+      "fmla z22.s, z29.s, z1.s[0]\n"
+      "fmla z26.s, z29.s, z0.s[0]\n"
+      "fmla z11.s, z28.s, z4.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z15.s, z28.s, z3.s[0]\n"
+      "fmla z19.s, z28.s, z2.s[0]\n"
+      "fmla z23.s, z28.s, z1.s[0]\n"
+      "fmla z27.s, z28.s, z0.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.s, z29.s, z4.s[1]\n"
+      "fmla z12.s, z29.s, z3.s[1]\n"
+      "fmla z16.s, z29.s, z2.s[1]\n"
+      "fmla z20.s, z29.s, z1.s[1]\n"
+      "fmla z24.s, z29.s, z0.s[1]\n"
+      "fmla z9.s, z28.s, z4.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z13.s, z28.s, z3.s[1]\n"
+      "fmla z17.s, z28.s, z2.s[1]\n"
+      "fmla z21.s, z28.s, z1.s[1]\n"
+      "fmla z25.s, z28.s, z0.s[1]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
+      "fmla z10.s, z29.s, z4.s[1]\n"
+      "fmla z14.s, z29.s, z3.s[1]\n"
+      "fmla z18.s, z29.s, z2.s[1]\n"
+      "fmla z22.s, z29.s, z1.s[1]\n"
+      "fmla z26.s, z29.s, z0.s[1]\n"
+      "fmla z11.s, z28.s, z4.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z15.s, z28.s, z3.s[1]\n"
+      "fmla z19.s, z28.s, z2.s[1]\n"
+      "fmla z23.s, z28.s, z1.s[1]\n"
+      "fmla z27.s, z28.s, z0.s[1]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.s, z29.s, z4.s[2]\n"
+      "fmla z12.s, z29.s, z3.s[2]\n"
+      "fmla z16.s, z29.s, z2.s[2]\n"
+      "fmla z20.s, z29.s, z1.s[2]\n"
+      "fmla z24.s, z29.s, z0.s[2]\n"
+      "fmla z9.s, z28.s, z4.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z13.s, z28.s, z3.s[2]\n"
+      "fmla z17.s, z28.s, z2.s[2]\n"
+      "fmla z21.s, z28.s, z1.s[2]\n"
+      "fmla z25.s, z28.s, z0.s[2]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.s, z29.s, z4.s[2]\n"
+      "fmla z14.s, z29.s, z3.s[2]\n"
+      "fmla z18.s, z29.s, z2.s[2]\n"
+      "fmla z22.s, z29.s, z1.s[2]\n"
+      "fmla z26.s, z29.s, z0.s[2]\n"
+      "fmla z11.s, z28.s, z4.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z15.s, z28.s, z3.s[2]\n"
+      "fmla z19.s, z28.s, z2.s[2]\n"
+      "fmla z23.s, z28.s, z1.s[2]\n"
+      "fmla z27.s, z28.s, z0.s[2]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.s, z29.s, z4.s[3]\n"
+      "fmla z12.s, z29.s, z3.s[3]\n"
+      "fmla z16.s, z29.s, z2.s[3]\n"
+      "fmla z20.s, z29.s, z1.s[3]\n"
+      "fmla z24.s, z29.s, z0.s[3]\n"
+      "fmla z9.s, z28.s, z4.s[3]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z13.s, z28.s, z3.s[3]\n"
+      "fmla z17.s, z28.s, z2.s[3]\n"
+      "fmla z21.s, z28.s, z1.s[3]\n"
+      "fmla z25.s, z28.s, z0.s[3]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.s, z29.s, z4.s[3]\n"
+      "fmla z14.s, z29.s, z3.s[3]\n"
+      "fmla z18.s, z29.s, z2.s[3]\n"
+      "fmla z22.s, z29.s, z1.s[3]\n"
+      "fmla z26.s, z29.s, z0.s[3]\n"
+      "fmla z11.s, z28.s, z4.s[3]\n"
+      "fmla z15.s, z28.s, z3.s[3]\n"
+      "fmla z19.s, z28.s, z2.s[3]\n"
+      "fmla z23.s, z28.s, z1.s[3]\n"
+      "fmla z27.s, z28.s, z0.s[3]\n"
       "bgt 61b\n"
       "62:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -1392,111 +1392,111 @@
       "ld1rqw { z2.s }, p0/Z, [x24]\n"
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
       "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
+      "fmla z8.s, z29.s, z0.s[0]\n"
+      "fmla z12.s, z29.s, z1.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z16.s, z29.s, z2.s[0]\n"
+      "fmla z20.s, z29.s, z3.s[0]\n"
+      "fmla z24.s, z29.s, z4.s[0]\n"
+      "fmla z9.s, z28.s, z0.s[0]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z28.s, z1.s[0]\n"
+      "fmla z17.s, z28.s, z2.s[0]\n"
+      "fmla z21.s, z28.s, z3.s[0]\n"
+      "fmla z25.s, z28.s, z4.s[0]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
+      "fmla z10.s, z29.s, z0.s[0]\n"
+      "fmla z14.s, z29.s, z1.s[0]\n"
+      "fmla z18.s, z29.s, z2.s[0]\n"
+      "fmla z22.s, z29.s, z3.s[0]\n"
+      "fmla z26.s, z29.s, z4.s[0]\n"
+      "fmla z11.s, z28.s, z0.s[0]\n"
+      "fmla z15.s, z28.s, z1.s[0]\n"
+      "fmla z19.s, z28.s, z2.s[0]\n"
+      "fmla z23.s, z28.s, z3.s[0]\n"
+      "fmla z27.s, z28.s, z4.s[0]\n"
       "ble 63f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z29.s, z0.s[1]\n"
+      "fmla z12.s, z29.s, z1.s[1]\n"
+      "fmla z16.s, z29.s, z2.s[1]\n"
+      "fmla z20.s, z29.s, z3.s[1]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.s, z29.s, z4.s[1]\n"
+      "fmla z9.s, z28.s, z0.s[1]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z28.s, z1.s[1]\n"
+      "fmla z17.s, z28.s, z2.s[1]\n"
+      "fmla z21.s, z28.s, z3.s[1]\n"
+      "fmla z25.s, z28.s, z4.s[1]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
+      "fmla z10.s, z29.s, z0.s[1]\n"
+      "fmla z14.s, z29.s, z1.s[1]\n"
+      "fmla z18.s, z29.s, z2.s[1]\n"
+      "fmla z22.s, z29.s, z3.s[1]\n"
+      "fmla z26.s, z29.s, z4.s[1]\n"
+      "fmla z11.s, z28.s, z0.s[1]\n"
+      "fmla z15.s, z28.s, z1.s[1]\n"
+      "fmla z19.s, z28.s, z2.s[1]\n"
+      "fmla z23.s, z28.s, z3.s[1]\n"
+      "fmla z27.s, z28.s, z4.s[1]\n"
       "ble 63f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z29.s, z0.s[2]\n"
+      "fmla z12.s, z29.s, z1.s[2]\n"
+      "fmla z16.s, z29.s, z2.s[2]\n"
+      "fmla z20.s, z29.s, z3.s[2]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.s, z29.s, z4.s[2]\n"
+      "fmla z9.s, z28.s, z0.s[2]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z28.s, z1.s[2]\n"
+      "fmla z17.s, z28.s, z2.s[2]\n"
+      "fmla z21.s, z28.s, z3.s[2]\n"
+      "fmla z25.s, z28.s, z4.s[2]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
+      "fmla z10.s, z29.s, z0.s[2]\n"
+      "fmla z14.s, z29.s, z1.s[2]\n"
+      "fmla z18.s, z29.s, z2.s[2]\n"
+      "fmla z22.s, z29.s, z3.s[2]\n"
+      "fmla z26.s, z29.s, z4.s[2]\n"
+      "fmla z11.s, z28.s, z0.s[2]\n"
+      "fmla z15.s, z28.s, z1.s[2]\n"
+      "fmla z19.s, z28.s, z2.s[2]\n"
+      "fmla z23.s, z28.s, z3.s[2]\n"
+      "fmla z27.s, z28.s, z4.s[2]\n"
       "ble 63f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z29.s }, p5/Z, [x10]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z29.s, z0.s[3]\n"
+      "fmla z12.s, z29.s, z1.s[3]\n"
+      "fmla z16.s, z29.s, z2.s[3]\n"
+      "fmla z20.s, z29.s, z3.s[3]\n"
+      "fmla z24.s, z29.s, z4.s[3]\n"
+      "fmla z9.s, z28.s, z0.s[3]\n"
+      "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z13.s, z28.s, z1.s[3]\n"
+      "fmla z17.s, z28.s, z2.s[3]\n"
+      "fmla z21.s, z28.s, z3.s[3]\n"
+      "fmla z25.s, z28.s, z4.s[3]\n"
+      "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
+      "fmla z10.s, z29.s, z0.s[3]\n"
+      "fmla z14.s, z29.s, z1.s[3]\n"
+      "fmla z18.s, z29.s, z2.s[3]\n"
+      "fmla z22.s, z29.s, z3.s[3]\n"
+      "fmla z26.s, z29.s, z4.s[3]\n"
+      "fmla z11.s, z28.s, z0.s[3]\n"
+      "fmla z15.s, z28.s, z1.s[3]\n"
+      "fmla z19.s, z28.s, z2.s[3]\n"
+      "fmla z23.s, z28.s, z3.s[3]\n"
+      "fmla z27.s, z28.s, z4.s[3]\n"
       "63:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1509,49 +1509,49 @@
       "add x22, x23, x20, LSL #2\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z29.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z23.s, p5/M, z23.s, z1.s\n"
-      "fmin z24.s, p5/M, z24.s, z1.s\n"
-      "fmin z25.s, p5/M, z25.s, z1.s\n"
-      "fmin z26.s, p5/M, z26.s, z1.s\n"
-      "fmin z27.s, p5/M, z27.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z23.s, p5/M, z23.s, z0.s\n"
-      "fmax z24.s, p5/M, z24.s, z0.s\n"
-      "fmax z25.s, p5/M, z25.s, z0.s\n"
-      "fmax z26.s, p5/M, z26.s, z0.s\n"
-      "fmax z27.s, p5/M, z27.s, z0.s\n"
+      "ld1rw { z28.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z29.s\n"
+      "fmin z9.s, p5/M, z9.s, z29.s\n"
+      "fmin z10.s, p5/M, z10.s, z29.s\n"
+      "fmin z11.s, p5/M, z11.s, z29.s\n"
+      "fmin z12.s, p5/M, z12.s, z29.s\n"
+      "fmin z13.s, p5/M, z13.s, z29.s\n"
+      "fmin z14.s, p5/M, z14.s, z29.s\n"
+      "fmin z15.s, p5/M, z15.s, z29.s\n"
+      "fmin z16.s, p5/M, z16.s, z29.s\n"
+      "fmin z17.s, p5/M, z17.s, z29.s\n"
+      "fmin z18.s, p5/M, z18.s, z29.s\n"
+      "fmin z19.s, p5/M, z19.s, z29.s\n"
+      "fmin z20.s, p5/M, z20.s, z29.s\n"
+      "fmin z21.s, p5/M, z21.s, z29.s\n"
+      "fmin z22.s, p5/M, z22.s, z29.s\n"
+      "fmin z23.s, p5/M, z23.s, z29.s\n"
+      "fmin z24.s, p5/M, z24.s, z29.s\n"
+      "fmin z25.s, p5/M, z25.s, z29.s\n"
+      "fmin z26.s, p5/M, z26.s, z29.s\n"
+      "fmin z27.s, p5/M, z27.s, z29.s\n"
+      "fmax z8.s, p5/M, z8.s, z28.s\n"
+      "fmax z9.s, p5/M, z9.s, z28.s\n"
+      "fmax z10.s, p5/M, z10.s, z28.s\n"
+      "fmax z11.s, p5/M, z11.s, z28.s\n"
+      "fmax z12.s, p5/M, z12.s, z28.s\n"
+      "fmax z13.s, p5/M, z13.s, z28.s\n"
+      "fmax z14.s, p5/M, z14.s, z28.s\n"
+      "fmax z15.s, p5/M, z15.s, z28.s\n"
+      "fmax z16.s, p5/M, z16.s, z28.s\n"
+      "fmax z17.s, p5/M, z17.s, z28.s\n"
+      "fmax z18.s, p5/M, z18.s, z28.s\n"
+      "fmax z19.s, p5/M, z19.s, z28.s\n"
+      "fmax z20.s, p5/M, z20.s, z28.s\n"
+      "fmax z21.s, p5/M, z21.s, z28.s\n"
+      "fmax z22.s, p5/M, z22.s, z28.s\n"
+      "fmax z23.s, p5/M, z23.s, z28.s\n"
+      "fmax z24.s, p5/M, z24.s, z28.s\n"
+      "fmax z25.s, p5/M, z25.s, z28.s\n"
+      "fmax z26.s, p5/M, z26.s, z28.s\n"
+      "fmax z27.s, p5/M, z27.s, z28.s\n"
       "64:"  // Height 5: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -1626,35 +1626,35 @@
       "68:"  // Height 6: no bias
       "tbz %x[flags], #0, 69f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x9]\n"
+      "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x24]\n"
-      "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x22]\n"
-      "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21]\n"
-      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x24]\n"
+      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x23]\n"
+      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x21]\n"
+      "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 70f\n"
       "69:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1686,16 +1686,16 @@
       "71:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 72f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 73f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1707,143 +1707,143 @@
       "b 73f\n"
       "72:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "73:"  // Height 6: input setup done
       "cmp x27, #0x4\n"
       "ble 75f\n"
       "74:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
+      "ld1rqw { z7.s }, p0/Z, [x26]\n"
+      "ld1rqw { z6.s }, p0/Z, [x25]\n"
       "sub x27, x27, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z4.s }, p0/Z, [x23]\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1rqw { z5.s }, p0/Z, [x21]\n"
+      "ld1rqw { z3.s }, p0/Z, [x22]\n"
+      "ld1rqw { z2.s }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1w { z1.s }, p5/Z, [x10]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[0]\n"
+      "fmla z12.s, z1.s, z6.s[0]\n"
+      "fmla z16.s, z1.s, z5.s[0]\n"
+      "fmla z20.s, z1.s, z4.s[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z28.s, z6.s, z5.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z24.s, z1.s, z3.s[0]\n"
+      "fmla z28.s, z1.s, z2.s[0]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "fmla z29.s, z7.s, z5.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z30.s, z6.s, z5.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
-      "fmla z31.s, z7.s, z5.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z28.s, z6.s, z5.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "fmla z29.s, z7.s, z5.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[0]\n"
+      "fmla z13.s, z0.s, z6.s[0]\n"
+      "fmla z17.s, z0.s, z5.s[0]\n"
+      "fmla z21.s, z0.s, z4.s[0]\n"
+      "fmla z25.s, z0.s, z3.s[0]\n"
+      "fmla z29.s, z0.s, z2.s[0]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z10.s, z1.s, z7.s[0]\n"
+      "fmla z14.s, z1.s, z6.s[0]\n"
+      "fmla z18.s, z1.s, z5.s[0]\n"
+      "fmla z22.s, z1.s, z4.s[0]\n"
+      "fmla z26.s, z1.s, z3.s[0]\n"
+      "fmla z30.s, z1.s, z2.s[0]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #4, MUL VL]\n"
+      "fmla z11.s, z0.s, z7.s[0]\n"
+      "fmla z15.s, z0.s, z6.s[0]\n"
+      "fmla z19.s, z0.s, z5.s[0]\n"
+      "fmla z23.s, z0.s, z4.s[0]\n"
+      "fmla z27.s, z0.s, z3.s[0]\n"
+      "fmla z31.s, z0.s, z2.s[0]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #5, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[1]\n"
+      "fmla z12.s, z1.s, z6.s[1]\n"
+      "fmla z16.s, z1.s, z5.s[1]\n"
+      "fmla z20.s, z1.s, z4.s[1]\n"
+      "fmla z24.s, z1.s, z3.s[1]\n"
+      "fmla z28.s, z1.s, z2.s[1]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #6, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[1]\n"
+      "fmla z13.s, z0.s, z6.s[1]\n"
+      "fmla z17.s, z0.s, z5.s[1]\n"
+      "fmla z21.s, z0.s, z4.s[1]\n"
+      "fmla z25.s, z0.s, z3.s[1]\n"
+      "fmla z29.s, z0.s, z2.s[1]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z30.s, z6.s, z5.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
-      "fmla z31.s, z7.s, z5.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z28.s, z6.s, z5.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "fmla z29.s, z7.s, z5.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z30.s, z6.s, z5.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
-      "fmla z31.s, z7.s, z5.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z28.s, z6.s, z5.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "fmla z29.s, z7.s, z5.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z30.s, z6.s, z5.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
-      "fmla z31.s, z7.s, z5.s[3]\n"
+      "fmla z10.s, z1.s, z7.s[1]\n"
+      "fmla z14.s, z1.s, z6.s[1]\n"
+      "fmla z18.s, z1.s, z5.s[1]\n"
+      "fmla z22.s, z1.s, z4.s[1]\n"
+      "fmla z26.s, z1.s, z3.s[1]\n"
+      "fmla z30.s, z1.s, z2.s[1]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #-8, MUL VL]\n"
+      "fmla z11.s, z0.s, z7.s[1]\n"
+      "fmla z15.s, z0.s, z6.s[1]\n"
+      "fmla z19.s, z0.s, z5.s[1]\n"
+      "fmla z23.s, z0.s, z4.s[1]\n"
+      "fmla z27.s, z0.s, z3.s[1]\n"
+      "fmla z31.s, z0.s, z2.s[1]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #-7, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[2]\n"
+      "fmla z12.s, z1.s, z6.s[2]\n"
+      "fmla z16.s, z1.s, z5.s[2]\n"
+      "fmla z20.s, z1.s, z4.s[2]\n"
+      "fmla z24.s, z1.s, z3.s[2]\n"
+      "fmla z28.s, z1.s, z2.s[2]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #-6, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[2]\n"
+      "fmla z13.s, z0.s, z6.s[2]\n"
+      "fmla z17.s, z0.s, z5.s[2]\n"
+      "fmla z21.s, z0.s, z4.s[2]\n"
+      "fmla z25.s, z0.s, z3.s[2]\n"
+      "fmla z29.s, z0.s, z2.s[2]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #-5, MUL VL]\n"
+      "fmla z10.s, z1.s, z7.s[2]\n"
+      "fmla z14.s, z1.s, z6.s[2]\n"
+      "fmla z18.s, z1.s, z5.s[2]\n"
+      "fmla z22.s, z1.s, z4.s[2]\n"
+      "fmla z26.s, z1.s, z3.s[2]\n"
+      "fmla z30.s, z1.s, z2.s[2]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #-4, MUL VL]\n"
+      "fmla z11.s, z0.s, z7.s[2]\n"
+      "fmla z15.s, z0.s, z6.s[2]\n"
+      "fmla z19.s, z0.s, z5.s[2]\n"
+      "fmla z23.s, z0.s, z4.s[2]\n"
+      "fmla z27.s, z0.s, z3.s[2]\n"
+      "fmla z31.s, z0.s, z2.s[2]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #-3, MUL VL]\n"
+      "fmla z8.s, z1.s, z7.s[3]\n"
+      "fmla z12.s, z1.s, z6.s[3]\n"
+      "fmla z16.s, z1.s, z5.s[3]\n"
+      "fmla z20.s, z1.s, z4.s[3]\n"
+      "fmla z24.s, z1.s, z3.s[3]\n"
+      "fmla z28.s, z1.s, z2.s[3]\n"
+      "ld1w { z1.s }, p5/Z, [x10, #-2, MUL VL]\n"
+      "fmla z9.s, z0.s, z7.s[3]\n"
+      "fmla z13.s, z0.s, z6.s[3]\n"
+      "fmla z17.s, z0.s, z5.s[3]\n"
+      "fmla z21.s, z0.s, z4.s[3]\n"
+      "fmla z25.s, z0.s, z3.s[3]\n"
+      "fmla z29.s, z0.s, z2.s[3]\n"
+      "ld1w { z0.s }, p5/Z, [x10, #-1, MUL VL]\n"
+      "fmla z10.s, z1.s, z7.s[3]\n"
+      "fmla z14.s, z1.s, z6.s[3]\n"
+      "fmla z18.s, z1.s, z5.s[3]\n"
+      "fmla z22.s, z1.s, z4.s[3]\n"
+      "fmla z26.s, z1.s, z3.s[3]\n"
+      "fmla z30.s, z1.s, z2.s[3]\n"
+      "fmla z11.s, z0.s, z7.s[3]\n"
+      "fmla z15.s, z0.s, z6.s[3]\n"
+      "fmla z19.s, z0.s, z5.s[3]\n"
+      "fmla z23.s, z0.s, z4.s[3]\n"
+      "fmla z27.s, z0.s, z3.s[3]\n"
+      "fmla z31.s, z0.s, z2.s[3]\n"
       "bgt 74b\n"
       "75:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
@@ -1854,127 +1854,127 @@
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
       "ld1rqw { z4.s }, p0/Z, [x22]\n"
       "ld1rqw { z5.s }, p0/Z, [x21]\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[0]\n"
-      "fmla z12.s, z6.s, z1.s[0]\n"
-      "fmla z16.s, z6.s, z2.s[0]\n"
-      "fmla z20.s, z6.s, z3.s[0]\n"
-      "fmla z24.s, z6.s, z4.s[0]\n"
-      "fmla z28.s, z6.s, z5.s[0]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[0]\n"
-      "fmla z13.s, z7.s, z1.s[0]\n"
-      "fmla z17.s, z7.s, z2.s[0]\n"
-      "fmla z21.s, z7.s, z3.s[0]\n"
-      "fmla z25.s, z7.s, z4.s[0]\n"
-      "fmla z29.s, z7.s, z5.s[0]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z7.s, z0.s[0]\n"
+      "fmla z12.s, z7.s, z1.s[0]\n"
+      "fmla z16.s, z7.s, z2.s[0]\n"
+      "fmla z20.s, z7.s, z3.s[0]\n"
+      "fmla z24.s, z7.s, z4.s[0]\n"
+      "fmla z28.s, z7.s, z5.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z6.s, z0.s[0]\n"
+      "fmla z13.s, z6.s, z1.s[0]\n"
+      "fmla z17.s, z6.s, z2.s[0]\n"
+      "fmla z21.s, z6.s, z3.s[0]\n"
+      "fmla z25.s, z6.s, z4.s[0]\n"
+      "fmla z29.s, z6.s, z5.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[0]\n"
-      "fmla z14.s, z6.s, z1.s[0]\n"
-      "fmla z18.s, z6.s, z2.s[0]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z26.s, z6.s, z4.s[0]\n"
-      "fmla z30.s, z6.s, z5.s[0]\n"
-      "fmla z11.s, z7.s, z0.s[0]\n"
-      "fmla z15.s, z7.s, z1.s[0]\n"
-      "fmla z19.s, z7.s, z2.s[0]\n"
-      "fmla z23.s, z7.s, z3.s[0]\n"
-      "fmla z27.s, z7.s, z4.s[0]\n"
-      "fmla z31.s, z7.s, z5.s[0]\n"
+      "fmla z10.s, z7.s, z0.s[0]\n"
+      "fmla z14.s, z7.s, z1.s[0]\n"
+      "fmla z18.s, z7.s, z2.s[0]\n"
+      "fmla z22.s, z7.s, z3.s[0]\n"
+      "fmla z26.s, z7.s, z4.s[0]\n"
+      "fmla z30.s, z7.s, z5.s[0]\n"
+      "fmla z11.s, z6.s, z0.s[0]\n"
+      "fmla z15.s, z6.s, z1.s[0]\n"
+      "fmla z19.s, z6.s, z2.s[0]\n"
+      "fmla z23.s, z6.s, z3.s[0]\n"
+      "fmla z27.s, z6.s, z4.s[0]\n"
+      "fmla z31.s, z6.s, z5.s[0]\n"
       "ble 76f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[1]\n"
-      "fmla z12.s, z6.s, z1.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[1]\n"
-      "fmla z20.s, z6.s, z3.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z7.s, z0.s[1]\n"
+      "fmla z12.s, z7.s, z1.s[1]\n"
+      "fmla z16.s, z7.s, z2.s[1]\n"
+      "fmla z20.s, z7.s, z3.s[1]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.s, z6.s, z4.s[1]\n"
-      "fmla z28.s, z6.s, z5.s[1]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[1]\n"
-      "fmla z13.s, z7.s, z1.s[1]\n"
-      "fmla z17.s, z7.s, z2.s[1]\n"
-      "fmla z21.s, z7.s, z3.s[1]\n"
-      "fmla z25.s, z7.s, z4.s[1]\n"
-      "fmla z29.s, z7.s, z5.s[1]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.s, z7.s, z4.s[1]\n"
+      "fmla z28.s, z7.s, z5.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z6.s, z0.s[1]\n"
+      "fmla z13.s, z6.s, z1.s[1]\n"
+      "fmla z17.s, z6.s, z2.s[1]\n"
+      "fmla z21.s, z6.s, z3.s[1]\n"
+      "fmla z25.s, z6.s, z4.s[1]\n"
+      "fmla z29.s, z6.s, z5.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[1]\n"
-      "fmla z14.s, z6.s, z1.s[1]\n"
-      "fmla z18.s, z6.s, z2.s[1]\n"
-      "fmla z22.s, z6.s, z3.s[1]\n"
-      "fmla z26.s, z6.s, z4.s[1]\n"
-      "fmla z30.s, z6.s, z5.s[1]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z15.s, z7.s, z1.s[1]\n"
-      "fmla z19.s, z7.s, z2.s[1]\n"
-      "fmla z23.s, z7.s, z3.s[1]\n"
-      "fmla z27.s, z7.s, z4.s[1]\n"
-      "fmla z31.s, z7.s, z5.s[1]\n"
+      "fmla z10.s, z7.s, z0.s[1]\n"
+      "fmla z14.s, z7.s, z1.s[1]\n"
+      "fmla z18.s, z7.s, z2.s[1]\n"
+      "fmla z22.s, z7.s, z3.s[1]\n"
+      "fmla z26.s, z7.s, z4.s[1]\n"
+      "fmla z30.s, z7.s, z5.s[1]\n"
+      "fmla z11.s, z6.s, z0.s[1]\n"
+      "fmla z15.s, z6.s, z1.s[1]\n"
+      "fmla z19.s, z6.s, z2.s[1]\n"
+      "fmla z23.s, z6.s, z3.s[1]\n"
+      "fmla z27.s, z6.s, z4.s[1]\n"
+      "fmla z31.s, z6.s, z5.s[1]\n"
       "ble 76f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[2]\n"
-      "fmla z12.s, z6.s, z1.s[2]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z20.s, z6.s, z3.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z7.s, z0.s[2]\n"
+      "fmla z12.s, z7.s, z1.s[2]\n"
+      "fmla z16.s, z7.s, z2.s[2]\n"
+      "fmla z20.s, z7.s, z3.s[2]\n"
       "subs x27, x27, #0x1\n"
-      "fmla z24.s, z6.s, z4.s[2]\n"
-      "fmla z28.s, z6.s, z5.s[2]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[2]\n"
-      "fmla z13.s, z7.s, z1.s[2]\n"
-      "fmla z17.s, z7.s, z2.s[2]\n"
-      "fmla z21.s, z7.s, z3.s[2]\n"
-      "fmla z25.s, z7.s, z4.s[2]\n"
-      "fmla z29.s, z7.s, z5.s[2]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "fmla z24.s, z7.s, z4.s[2]\n"
+      "fmla z28.s, z7.s, z5.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z6.s, z0.s[2]\n"
+      "fmla z13.s, z6.s, z1.s[2]\n"
+      "fmla z17.s, z6.s, z2.s[2]\n"
+      "fmla z21.s, z6.s, z3.s[2]\n"
+      "fmla z25.s, z6.s, z4.s[2]\n"
+      "fmla z29.s, z6.s, z5.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[2]\n"
-      "fmla z14.s, z6.s, z1.s[2]\n"
-      "fmla z18.s, z6.s, z2.s[2]\n"
-      "fmla z22.s, z6.s, z3.s[2]\n"
-      "fmla z26.s, z6.s, z4.s[2]\n"
-      "fmla z30.s, z6.s, z5.s[2]\n"
-      "fmla z11.s, z7.s, z0.s[2]\n"
-      "fmla z15.s, z7.s, z1.s[2]\n"
-      "fmla z19.s, z7.s, z2.s[2]\n"
-      "fmla z23.s, z7.s, z3.s[2]\n"
-      "fmla z27.s, z7.s, z4.s[2]\n"
-      "fmla z31.s, z7.s, z5.s[2]\n"
+      "fmla z10.s, z7.s, z0.s[2]\n"
+      "fmla z14.s, z7.s, z1.s[2]\n"
+      "fmla z18.s, z7.s, z2.s[2]\n"
+      "fmla z22.s, z7.s, z3.s[2]\n"
+      "fmla z26.s, z7.s, z4.s[2]\n"
+      "fmla z30.s, z7.s, z5.s[2]\n"
+      "fmla z11.s, z6.s, z0.s[2]\n"
+      "fmla z15.s, z6.s, z1.s[2]\n"
+      "fmla z19.s, z6.s, z2.s[2]\n"
+      "fmla z23.s, z6.s, z3.s[2]\n"
+      "fmla z27.s, z6.s, z4.s[2]\n"
+      "fmla z31.s, z6.s, z5.s[2]\n"
       "ble 76f\n"
-      "ld1w { z6.s }, p5/Z, [x10]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
-      "fmla z8.s, z6.s, z0.s[3]\n"
-      "fmla z12.s, z6.s, z1.s[3]\n"
-      "fmla z16.s, z6.s, z2.s[3]\n"
-      "fmla z20.s, z6.s, z3.s[3]\n"
-      "fmla z24.s, z6.s, z4.s[3]\n"
-      "fmla z28.s, z6.s, z5.s[3]\n"
-      "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
-      "fmla z9.s, z7.s, z0.s[3]\n"
-      "fmla z13.s, z7.s, z1.s[3]\n"
-      "fmla z17.s, z7.s, z2.s[3]\n"
-      "fmla z21.s, z7.s, z3.s[3]\n"
-      "fmla z25.s, z7.s, z4.s[3]\n"
-      "fmla z29.s, z7.s, z5.s[3]\n"
-      "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z7.s }, p5/Z, [x10]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n"
+      "fmla z8.s, z7.s, z0.s[3]\n"
+      "fmla z12.s, z7.s, z1.s[3]\n"
+      "fmla z16.s, z7.s, z2.s[3]\n"
+      "fmla z20.s, z7.s, z3.s[3]\n"
+      "fmla z24.s, z7.s, z4.s[3]\n"
+      "fmla z28.s, z7.s, z5.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n"
+      "fmla z9.s, z6.s, z0.s[3]\n"
+      "fmla z13.s, z6.s, z1.s[3]\n"
+      "fmla z17.s, z6.s, z2.s[3]\n"
+      "fmla z21.s, z6.s, z3.s[3]\n"
+      "fmla z25.s, z6.s, z4.s[3]\n"
+      "fmla z29.s, z6.s, z5.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "fmla z10.s, z6.s, z0.s[3]\n"
-      "fmla z14.s, z6.s, z1.s[3]\n"
-      "fmla z18.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[3]\n"
-      "fmla z26.s, z6.s, z4.s[3]\n"
-      "fmla z30.s, z6.s, z5.s[3]\n"
-      "fmla z11.s, z7.s, z0.s[3]\n"
-      "fmla z15.s, z7.s, z1.s[3]\n"
-      "fmla z19.s, z7.s, z2.s[3]\n"
-      "fmla z23.s, z7.s, z3.s[3]\n"
-      "fmla z27.s, z7.s, z4.s[3]\n"
-      "fmla z31.s, z7.s, z5.s[3]\n"
+      "fmla z10.s, z7.s, z0.s[3]\n"
+      "fmla z14.s, z7.s, z1.s[3]\n"
+      "fmla z18.s, z7.s, z2.s[3]\n"
+      "fmla z22.s, z7.s, z3.s[3]\n"
+      "fmla z26.s, z7.s, z4.s[3]\n"
+      "fmla z30.s, z7.s, z5.s[3]\n"
+      "fmla z11.s, z6.s, z0.s[3]\n"
+      "fmla z15.s, z6.s, z1.s[3]\n"
+      "fmla z19.s, z6.s, z2.s[3]\n"
+      "fmla z23.s, z6.s, z3.s[3]\n"
+      "fmla z27.s, z6.s, z4.s[3]\n"
+      "fmla z31.s, z6.s, z5.s[3]\n"
       "76:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -2081,7 +2081,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "80:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2089,4 +2088,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index c0718b1..a353c9d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 
 #define ARGLIST  \
@@ -89,5 +89,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
index 2ccd050..3443412 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
@@ -127,11 +127,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
       "cbnz x10, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -143,19 +143,19 @@
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "ble 10f\n"
       "9:"  // Height 1: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "addvl x12, x12, #1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "addvl x12, x12, #1\n"
       "bne 6b\n"
       "tbz %x[flags], #1, 11f\n"
@@ -189,9 +189,9 @@
       "15:"  // Height 2: no bias
       "tbz %x[flags], #0, 16f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
+      "add x20, x11, x20, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
+      "ld1w { z25.s }, p0/Z, [x20]\n"
       "b 17f\n"
       "16:"  // Height 2: no accumulate
       "mov z24.b, #0x0\n"
@@ -201,12 +201,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
       "cbnz x10, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -214,30 +214,30 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
       "20:"  // Height 2: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
       "ble 22f\n"
       "21:"  // Height 2: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
       "bgt 21b\n"
       "22:"  // Height 2: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
       "bne 18b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -277,11 +277,11 @@
       "27:"  // Height 3: no bias
       "tbz %x[flags], #0, 28f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x21, x11, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
-      "ld1w { z26.s }, p0/Z, [x26]\n"
+      "ld1w { z25.s }, p0/Z, [x21]\n"
+      "ld1w { z26.s }, p0/Z, [x20]\n"
       "b 29f\n"
       "28:"  // Height 3: no accumulate
       "mov z24.b, #0x0\n"
@@ -292,13 +292,13 @@
       "30:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
       "cbnz x10, 32f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -307,8 +307,8 @@
       "b 32f\n"
       "31:"  // Height 3: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
       "32:"  // Height 3: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -316,14 +316,14 @@
       "ld1rw { z2.s }, p1/Z, [x26]\n"
       "ble 34f\n"
       "33:"  // Height 3: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
       "add x26, x26, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
-      "fmla z26.s, p1/M, z8.s, z2.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "addvl x12, x12, #1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
@@ -331,13 +331,13 @@
       "bgt 33b\n"
       "34:"  // Height 3: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, p1/M, z9.s, z2.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "bne 30b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x27, x11, x20, LSL #2\n"
@@ -381,13 +381,13 @@
       "39:"  // Height 4: no bias
       "tbz %x[flags], #0, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x22, x11, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
-      "ld1w { z26.s }, p0/Z, [x26]\n"
-      "ld1w { z27.s }, p0/Z, [x25]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z25.s }, p0/Z, [x22]\n"
+      "ld1w { z26.s }, p0/Z, [x21]\n"
+      "ld1w { z27.s }, p0/Z, [x20]\n"
       "b 41f\n"
       "40:"  // Height 4: no accumulate
       "mov z24.b, #0x0\n"
@@ -399,14 +399,14 @@
       "42:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 43f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
       "cbnz x10, 44f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -416,9 +416,9 @@
       "b 44f\n"
       "43:"  // Height 4: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "44:"  // Height 4: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -427,16 +427,16 @@
       "ld1rw { z3.s }, p1/Z, [x25]\n"
       "ble 46f\n"
       "45:"  // Height 4: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
       "add x26, x26, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
-      "fmla z26.s, p1/M, z8.s, z2.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "add x25, x25, #0x4\n"
-      "fmla z27.s, p1/M, z8.s, z3.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
       "addvl x12, x12, #1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
@@ -445,14 +445,14 @@
       "bgt 45b\n"
       "46:"  // Height 4: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, p1/M, z9.s, z2.s\n"
-      "fmla z27.s, p1/M, z9.s, z3.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
       "bne 42b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x27, x11, x20, LSL #2\n"
@@ -501,15 +501,15 @@
       "51:"  // Height 5: no bias
       "tbz %x[flags], #0, 52f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x23, x11, x20, LSL #2\n"
+      "add x22, x23, x20, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
-      "ld1w { z26.s }, p0/Z, [x26]\n"
-      "ld1w { z27.s }, p0/Z, [x25]\n"
-      "ld1w { z28.s }, p0/Z, [x24]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z25.s }, p0/Z, [x23]\n"
+      "ld1w { z26.s }, p0/Z, [x22]\n"
+      "ld1w { z27.s }, p0/Z, [x21]\n"
+      "ld1w { z28.s }, p0/Z, [x20]\n"
       "b 53f\n"
       "52:"  // Height 5: no accumulate
       "mov z24.b, #0x0\n"
@@ -522,15 +522,15 @@
       "54:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 55f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
       "cbnz x10, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -541,10 +541,10 @@
       "b 56f\n"
       "55:"  // Height 5: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "56:"  // Height 5: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -554,20 +554,20 @@
       "ld1rw { z4.s }, p1/Z, [x24]\n"
       "ble 58f\n"
       "57:"  // Height 5: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
       "add x26, x26, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
-      "fmla z26.s, p1/M, z8.s, z2.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "add x25, x25, #0x4\n"
       "add x24, x24, #0x4\n"
-      "fmla z27.s, p1/M, z8.s, z3.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, p1/M, z8.s, z4.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
       "ld1rw { z2.s }, p1/Z, [x26]\n"
       "ld1rw { z3.s }, p1/Z, [x25]\n"
@@ -575,15 +575,15 @@
       "bgt 57b\n"
       "58:"  // Height 5: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, p1/M, z9.s, z2.s\n"
-      "fmla z27.s, p1/M, z9.s, z3.s\n"
-      "fmla z28.s, p1/M, z9.s, z4.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
       "bne 54b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x27, x11, x20, LSL #2\n"
@@ -636,18 +636,18 @@
       "b 65f\n"
       "63:"  // Height 6: no bias
       "tbz %x[flags], #0, 64f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x23, x11, x24, LSL #2\n"
+      "add x20, x23, x24, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
-      "ld1w { z26.s }, p0/Z, [x26]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z27.s }, p0/Z, [x25]\n"
-      "ld1w { z28.s }, p0/Z, [x24]\n"
-      "ld1w { z29.s }, p0/Z, [x23]\n"
+      "add x22, x20, x24, LSL #2\n"
+      "add x21, x22, x24, LSL #2\n"
+      "ld1w { z25.s }, p0/Z, [x23]\n"
+      "ld1w { z26.s }, p0/Z, [x20]\n"
+      "add x20, x21, x24, LSL #2\n"
+      "ld1w { z27.s }, p0/Z, [x22]\n"
+      "ld1w { z28.s }, p0/Z, [x21]\n"
+      "ld1w { z29.s }, p0/Z, [x20]\n"
       "b 65f\n"
       "64:"  // Height 6: no accumulate
       "mov z24.b, #0x0\n"
@@ -661,16 +661,16 @@
       "66:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 67f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
       "cbnz x10, 68f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -682,11 +682,11 @@
       "b 68f\n"
       "67:"  // Height 6: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "68:"  // Height 6: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -697,21 +697,21 @@
       "ld1rw { z5.s }, p1/Z, [x23]\n"
       "ble 70f\n"
       "69:"  // Height 6: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
       "add x26, x26, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
-      "fmla z26.s, p1/M, z8.s, z2.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "add x25, x25, #0x4\n"
       "add x24, x24, #0x4\n"
-      "fmla z27.s, p1/M, z8.s, z3.s\n"
-      "fmla z28.s, p1/M, z8.s, z4.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
       "add x23, x23, #0x4\n"
       "addvl x12, x12, #1\n"
-      "fmla z29.s, p1/M, z8.s, z5.s\n"
+      "fmla z29.s, p1/M, z16.s, z5.s\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
       "ld1rw { z2.s }, p1/Z, [x26]\n"
@@ -721,16 +721,16 @@
       "bgt 69b\n"
       "70:"  // Height 6: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, p1/M, z9.s, z2.s\n"
-      "fmla z27.s, p1/M, z9.s, z3.s\n"
-      "fmla z28.s, p1/M, z9.s, z4.s\n"
-      "fmla z29.s, p1/M, z9.s, z5.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
+      "fmla z29.s, p1/M, z16.s, z5.s\n"
       "bne 66b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x27, x11, x20, LSL #2\n"
@@ -788,20 +788,20 @@
       "b 77f\n"
       "75:"  // Height 7: no bias
       "tbz %x[flags], #0, 76f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x21, x11, x24, LSL #2\n"
+      "add x20, x21, x24, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
-      "ld1w { z26.s }, p0/Z, [x26]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z27.s }, p0/Z, [x25]\n"
-      "ld1w { z28.s }, p0/Z, [x24]\n"
-      "ld1w { z29.s }, p0/Z, [x23]\n"
-      "ld1w { z30.s }, p0/Z, [x22]\n"
+      "add x23, x20, x24, LSL #2\n"
+      "add x22, x23, x24, LSL #2\n"
+      "ld1w { z25.s }, p0/Z, [x21]\n"
+      "ld1w { z26.s }, p0/Z, [x20]\n"
+      "add x21, x22, x24, LSL #2\n"
+      "add x20, x21, x24, LSL #2\n"
+      "ld1w { z27.s }, p0/Z, [x23]\n"
+      "ld1w { z28.s }, p0/Z, [x22]\n"
+      "ld1w { z29.s }, p0/Z, [x21]\n"
+      "ld1w { z30.s }, p0/Z, [x20]\n"
       "b 77f\n"
       "76:"  // Height 7: no accumulate
       "mov z24.b, #0x0\n"
@@ -816,17 +816,17 @@
       "78:"  // Height 7: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 79f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
-      "ldr x22, [x21, #0x30]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
       "cbnz x10, 80f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -839,12 +839,12 @@
       "b 80f\n"
       "79:"  // Height 7: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "80:"  // Height 7: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -856,25 +856,25 @@
       "ld1rw { z6.s }, p1/Z, [x22]\n"
       "ble 82f\n"
       "81:"  // Height 7: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
       "add x26, x26, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
-      "fmla z26.s, p1/M, z8.s, z2.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "add x25, x25, #0x4\n"
       "add x24, x24, #0x4\n"
-      "fmla z27.s, p1/M, z8.s, z3.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "add x23, x23, #0x4\n"
       "add x22, x22, #0x4\n"
-      "fmla z28.s, p1/M, z8.s, z4.s\n"
-      "fmla z29.s, p1/M, z8.s, z5.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
+      "fmla z29.s, p1/M, z16.s, z5.s\n"
       "addvl x12, x12, #1\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
-      "fmla z30.s, p1/M, z8.s, z6.s\n"
+      "fmla z30.s, p1/M, z16.s, z6.s\n"
       "ld1rw { z2.s }, p1/Z, [x26]\n"
       "ld1rw { z3.s }, p1/Z, [x25]\n"
       "ld1rw { z4.s }, p1/Z, [x24]\n"
@@ -883,17 +883,17 @@
       "bgt 81b\n"
       "82:"  // Height 7: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, p1/M, z9.s, z2.s\n"
-      "fmla z27.s, p1/M, z9.s, z3.s\n"
-      "fmla z28.s, p1/M, z9.s, z4.s\n"
-      "fmla z29.s, p1/M, z9.s, z5.s\n"
-      "fmla z30.s, p1/M, z9.s, z6.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
+      "fmla z29.s, p1/M, z16.s, z5.s\n"
+      "fmla z30.s, p1/M, z16.s, z6.s\n"
       "bne 78b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x27, x11, x20, LSL #2\n"
@@ -959,22 +959,22 @@
       "b 89f\n"
       "87:"  // Height 8: no bias
       "tbz %x[flags], #0, 88f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x22, x11, x24, LSL #2\n"
+      "add x21, x22, x24, LSL #2\n"
       "ld1w { z24.s }, p0/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p0/Z, [x27]\n"
-      "ld1w { z26.s }, p0/Z, [x26]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z27.s }, p0/Z, [x25]\n"
-      "ld1w { z28.s }, p0/Z, [x24]\n"
-      "add x21, x22, x20, LSL #2\n"
-      "ld1w { z29.s }, p0/Z, [x23]\n"
-      "ld1w { z30.s }, p0/Z, [x22]\n"
-      "ld1w { z31.s }, p0/Z, [x21]\n"
+      "add x23, x21, x24, LSL #2\n"
+      "add x20, x23, x24, LSL #2\n"
+      "ld1w { z25.s }, p0/Z, [x22]\n"
+      "ld1w { z26.s }, p0/Z, [x21]\n"
+      "add x22, x20, x24, LSL #2\n"
+      "add x21, x22, x24, LSL #2\n"
+      "ld1w { z27.s }, p0/Z, [x23]\n"
+      "ld1w { z28.s }, p0/Z, [x20]\n"
+      "add x20, x21, x24, LSL #2\n"
+      "ld1w { z29.s }, p0/Z, [x22]\n"
+      "ld1w { z30.s }, p0/Z, [x21]\n"
+      "ld1w { z31.s }, p0/Z, [x20]\n"
       "b 89f\n"
       "88:"  // Height 8: no accumulate
       "mov z24.b, #0x0\n"
@@ -990,18 +990,18 @@
       "90:"  // Height 8: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 91f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
-      "ldr x22, [x21, #0x30]\n"
-      "ldr x21, [x21, #0x38]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "ldr x21, [x20, #0x38]\n"
       "cbnz x10, 92f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -1015,13 +1015,13 @@
       "b 92f\n"
       "91:"  // Height 8: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "92:"  // Height 8: input setup done
       "subs x9, x9, #0x1\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
@@ -1034,27 +1034,27 @@
       "ld1rw { z7.s }, p1/Z, [x21]\n"
       "ble 94f\n"
       "93:"  // Height 8: Multiply loop: Main loop
-      "ld1w { z8.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x28, x28, #0x4\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, p1/M, z8.s, z0.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
       "add x27, x27, #0x4\n"
       "add x26, x26, #0x4\n"
-      "fmla z25.s, p1/M, z8.s, z1.s\n"
-      "fmla z26.s, p1/M, z8.s, z2.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
       "add x25, x25, #0x4\n"
       "add x24, x24, #0x4\n"
-      "fmla z27.s, p1/M, z8.s, z3.s\n"
-      "fmla z28.s, p1/M, z8.s, z4.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
       "add x23, x23, #0x4\n"
       "add x22, x22, #0x4\n"
-      "fmla z29.s, p1/M, z8.s, z5.s\n"
+      "fmla z29.s, p1/M, z16.s, z5.s\n"
       "ld1rw { z0.s }, p1/Z, [x28]\n"
       "add x21, x21, #0x4\n"
       "addvl x12, x12, #1\n"
       "ld1rw { z1.s }, p1/Z, [x27]\n"
-      "fmla z30.s, p1/M, z8.s, z6.s\n"
-      "fmla z31.s, p1/M, z8.s, z7.s\n"
+      "fmla z30.s, p1/M, z16.s, z6.s\n"
+      "fmla z31.s, p1/M, z16.s, z7.s\n"
       "ld1rw { z2.s }, p1/Z, [x26]\n"
       "ld1rw { z3.s }, p1/Z, [x25]\n"
       "ld1rw { z4.s }, p1/Z, [x24]\n"
@@ -1064,18 +1064,18 @@
       "bgt 93b\n"
       "94:"  // Height 8: Multiply loop: Main loop skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-      "ld1w { z9.s }, p1/Z, [x12]\n"
+      "ld1w { z16.s }, p1/Z, [x12]\n"
       "add x10, x10, #0x1\n"
       "cmp x10, x20\n"
-      "fmla z24.s, p1/M, z9.s, z0.s\n"
-      "fmla z25.s, p1/M, z9.s, z1.s\n"
+      "fmla z24.s, p1/M, z16.s, z0.s\n"
+      "fmla z25.s, p1/M, z16.s, z1.s\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, p1/M, z9.s, z2.s\n"
-      "fmla z27.s, p1/M, z9.s, z3.s\n"
-      "fmla z28.s, p1/M, z9.s, z4.s\n"
-      "fmla z29.s, p1/M, z9.s, z5.s\n"
-      "fmla z30.s, p1/M, z9.s, z6.s\n"
-      "fmla z31.s, p1/M, z9.s, z7.s\n"
+      "fmla z26.s, p1/M, z16.s, z2.s\n"
+      "fmla z27.s, p1/M, z16.s, z3.s\n"
+      "fmla z28.s, p1/M, z16.s, z4.s\n"
+      "fmla z29.s, p1/M, z16.s, z5.s\n"
+      "fmla z30.s, p1/M, z16.s, z6.s\n"
+      "fmla z31.s, p1/M, z16.s, z7.s\n"
       "bne 90b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x27, x11, x20, LSL #2\n"
@@ -1132,12 +1132,11 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "98:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
-      : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index 9679d49..161c85e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -127,11 +127,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
       "cbnz x10, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -144,39 +144,39 @@
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
       "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
       "sub x9, x9, #0x4\n"
       "cmp x9, #0x4\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
       "add x28, x28, #0x10\n"
       "addvl x12, x12, #4\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
       "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
       "addvl x12, x12, #1\n"
       "ble 11f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
       "addvl x12, x12, #1\n"
       "ble 11f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
       "addvl x12, x12, #1\n"
       "ble 11f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
       "addvl x12, x12, #1\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -214,9 +214,9 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
+      "add x20, x11, x20, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
+      "ld1w { z25.s }, p1/Z, [x20]\n"
       "b 18f\n"
       "17:"  // Height 2: no accumulate
       "mov z24.b, #0x0\n"
@@ -226,12 +226,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
       "cbnz x10, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -239,29 +239,29 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
       "21:"  // Height 2: input setup done
       "cmp x9, #0x4\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1rqw { z1.s }, p0/Z, [x27]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "ld1rqw { z0.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z1.s[0]\n"
+      "fmla z25.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "fmla z24.s, z16.s, z1.s[1]\n"
+      "fmla z25.s, z16.s, z0.s[1]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "fmla z24.s, z17.s, z1.s[2]\n"
+      "fmla z25.s, z17.s, z0.s[2]\n"
       "cmp x9, #0x4\n"
       "add x28, x28, #0x10\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z24.s, z16.s, z1.s[3]\n"
+      "fmla z25.s, z16.s, z0.s[3]\n"
       "add x27, x27, #0x10\n"
       "addvl x12, x12, #4\n"
       "bgt 22b\n"
@@ -270,26 +270,26 @@
       "ld1rqw { z0.s }, p0/Z, [x28]\n"
       "ld1rqw { z1.s }, p0/Z, [x27]\n"
       "subs x9, x9, #0x1\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
       "addvl x12, x12, #1\n"
       "ble 24f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
       "addvl x12, x12, #1\n"
       "ble 24f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
       "addvl x12, x12, #1\n"
       "ble 24f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -333,11 +333,11 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x21, x11, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
-      "ld1w { z26.s }, p1/Z, [x26]\n"
+      "ld1w { z25.s }, p1/Z, [x21]\n"
+      "ld1w { z26.s }, p1/Z, [x20]\n"
       "b 31f\n"
       "30:"  // Height 3: no accumulate
       "mov z24.b, #0x0\n"
@@ -348,13 +348,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
       "cbnz x10, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -363,38 +363,38 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
       "34:"  // Height 3: input setup done
       "cmp x9, #0x4\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
+      "ld1rqw { z2.s }, p0/Z, [x28]\n"
       "ld1rqw { z1.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x26]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "ld1rqw { z0.s }, p0/Z, [x26]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z2.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
+      "fmla z26.s, z16.s, z0.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "fmla z24.s, z16.s, z2.s[1]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z0.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
       "cmp x9, #0x4\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z24.s, z17.s, z2.s[2]\n"
+      "fmla z25.s, z17.s, z1.s[2]\n"
       "add x28, x28, #0x10\n"
       "add x27, x27, #0x10\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z26.s, z17.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z2.s[3]\n"
       "add x26, x26, #0x10\n"
       "addvl x12, x12, #4\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
+      "fmla z26.s, z16.s, z0.s[3]\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
@@ -402,31 +402,31 @@
       "ld1rqw { z1.s }, p0/Z, [x27]\n"
       "subs x9, x9, #0x1\n"
       "ld1rqw { z2.s }, p0/Z, [x26]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
       "addvl x12, x12, #1\n"
       "ble 37f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z2.s[1]\n"
       "addvl x12, x12, #1\n"
       "ble 37f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
+      "fmla z26.s, z16.s, z2.s[2]\n"
       "addvl x12, x12, #1\n"
       "ble 37f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x10, x10, #0x1\n"
@@ -474,13 +474,13 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x22, x11, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
-      "ld1w { z26.s }, p1/Z, [x26]\n"
-      "ld1w { z27.s }, p1/Z, [x25]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z25.s }, p1/Z, [x22]\n"
+      "ld1w { z26.s }, p1/Z, [x21]\n"
+      "ld1w { z27.s }, p1/Z, [x20]\n"
       "b 44f\n"
       "43:"  // Height 4: no accumulate
       "mov z24.b, #0x0\n"
@@ -492,14 +492,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
       "cbnz x10, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -509,45 +509,45 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "47:"  // Height 4: input setup done
       "cmp x9, #0x4\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1rqw { z1.s }, p0/Z, [x27]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "ld1rqw { z2.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x26]\n"
-      "ld1rqw { z3.s }, p0/Z, [x25]\n"
+      "ld1rqw { z1.s }, p0/Z, [x26]\n"
+      "ld1rqw { z0.s }, p0/Z, [x25]\n"
       "cmp x9, #0x4\n"
       "add x28, x28, #0x10\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z3.s[0]\n"
+      "fmla z25.s, z16.s, z2.s[0]\n"
+      "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "fmla z26.s, z16.s, z1.s[0]\n"
+      "fmla z27.s, z16.s, z0.s[0]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "fmla z24.s, z18.s, z3.s[1]\n"
+      "fmla z25.s, z18.s, z2.s[1]\n"
       "add x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z26.s, z18.s, z1.s[1]\n"
+      "fmla z27.s, z18.s, z0.s[1]\n"
       "add x25, x25, #0x10\n"
       "addvl x12, x12, #4\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
+      "fmla z24.s, z17.s, z3.s[2]\n"
+      "fmla z25.s, z17.s, z2.s[2]\n"
+      "fmla z26.s, z17.s, z1.s[2]\n"
+      "fmla z27.s, z17.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z3.s[3]\n"
+      "fmla z25.s, z16.s, z2.s[3]\n"
+      "fmla z26.s, z16.s, z1.s[3]\n"
+      "fmla z27.s, z16.s, z0.s[3]\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
@@ -556,35 +556,35 @@
       "subs x9, x9, #0x1\n"
       "ld1rqw { z2.s }, p0/Z, [x26]\n"
       "ld1rqw { z3.s }, p0/Z, [x25]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
+      "fmla z27.s, z16.s, z3.s[0]\n"
       "ble 50f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z2.s[1]\n"
+      "fmla z27.s, z16.s, z3.s[1]\n"
       "addvl x12, x12, #1\n"
       "ble 50f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
+      "fmla z26.s, z16.s, z2.s[2]\n"
+      "fmla z27.s, z16.s, z3.s[2]\n"
       "addvl x12, x12, #1\n"
       "ble 50f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
+      "fmla z27.s, z16.s, z3.s[3]\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x10, x10, #0x1\n"
@@ -637,15 +637,15 @@
       "55:"  // Height 5: no bias
       "tbz %x[flags], #0, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "add x23, x11, x20, LSL #2\n"
+      "add x22, x23, x20, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
-      "ld1w { z26.s }, p1/Z, [x26]\n"
-      "ld1w { z27.s }, p1/Z, [x25]\n"
-      "ld1w { z28.s }, p1/Z, [x24]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z25.s }, p1/Z, [x23]\n"
+      "ld1w { z26.s }, p1/Z, [x22]\n"
+      "ld1w { z27.s }, p1/Z, [x21]\n"
+      "ld1w { z28.s }, p1/Z, [x20]\n"
       "b 57f\n"
       "56:"  // Height 5: no accumulate
       "mov z24.b, #0x0\n"
@@ -658,15 +658,15 @@
       "58:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
       "cbnz x10, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -677,52 +677,52 @@
       "b 60f\n"
       "59:"  // Height 5: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "60:"  // Height 5: input setup done
       "cmp x9, #0x4\n"
       "ble 62f\n"
       "61:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1rqw { z1.s }, p0/Z, [x27]\n"
+      "ld1rqw { z4.s }, p0/Z, [x28]\n"
+      "ld1rqw { z3.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
       "ld1rqw { z2.s }, p0/Z, [x26]\n"
-      "ld1rqw { z3.s }, p0/Z, [x25]\n"
+      "ld1rqw { z1.s }, p0/Z, [x25]\n"
       "cmp x9, #0x4\n"
       "add x28, x28, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x24]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "ld1rqw { z0.s }, p0/Z, [x24]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z4.s[0]\n"
+      "fmla z25.s, z16.s, z3.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
+      "fmla z27.s, z16.s, z1.s[0]\n"
+      "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "fmla z28.s, z16.s, z0.s[0]\n"
+      "fmla z24.s, z18.s, z4.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
       "add x27, x27, #0x10\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
+      "fmla z25.s, z18.s, z3.s[1]\n"
+      "fmla z26.s, z18.s, z2.s[1]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
+      "fmla z27.s, z18.s, z1.s[1]\n"
+      "fmla z28.s, z18.s, z0.s[1]\n"
       "add x24, x24, #0x10\n"
       "addvl x12, x12, #4\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
+      "fmla z24.s, z17.s, z4.s[2]\n"
+      "fmla z25.s, z17.s, z3.s[2]\n"
+      "fmla z26.s, z17.s, z2.s[2]\n"
+      "fmla z27.s, z17.s, z1.s[2]\n"
+      "fmla z28.s, z17.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z4.s[3]\n"
+      "fmla z25.s, z16.s, z3.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
+      "fmla z27.s, z16.s, z1.s[3]\n"
+      "fmla z28.s, z16.s, z0.s[3]\n"
       "bgt 61b\n"
       "62:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
@@ -732,39 +732,39 @@
       "ld1rqw { z2.s }, p0/Z, [x26]\n"
       "ld1rqw { z3.s }, p0/Z, [x25]\n"
       "ld1rqw { z4.s }, p0/Z, [x24]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
+      "fmla z27.s, z16.s, z3.s[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
+      "fmla z28.s, z16.s, z4.s[0]\n"
       "ble 63f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z2.s[1]\n"
+      "fmla z27.s, z16.s, z3.s[1]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
+      "fmla z28.s, z16.s, z4.s[1]\n"
       "ble 63f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
+      "fmla z26.s, z16.s, z2.s[2]\n"
+      "fmla z27.s, z16.s, z3.s[2]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
+      "fmla z28.s, z16.s, z4.s[2]\n"
       "ble 63f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
+      "fmla z27.s, z16.s, z3.s[3]\n"
+      "fmla z28.s, z16.s, z4.s[3]\n"
       "63:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x10, x10, #0x1\n"
@@ -821,18 +821,18 @@
       "b 70f\n"
       "68:"  // Height 6: no bias
       "tbz %x[flags], #0, 69f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x23, x11, x24, LSL #2\n"
+      "add x20, x23, x24, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
-      "ld1w { z26.s }, p1/Z, [x26]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z27.s }, p1/Z, [x25]\n"
-      "ld1w { z28.s }, p1/Z, [x24]\n"
-      "ld1w { z29.s }, p1/Z, [x23]\n"
+      "add x22, x20, x24, LSL #2\n"
+      "add x21, x22, x24, LSL #2\n"
+      "ld1w { z25.s }, p1/Z, [x23]\n"
+      "ld1w { z26.s }, p1/Z, [x20]\n"
+      "add x20, x21, x24, LSL #2\n"
+      "ld1w { z27.s }, p1/Z, [x22]\n"
+      "ld1w { z28.s }, p1/Z, [x21]\n"
+      "ld1w { z29.s }, p1/Z, [x20]\n"
       "b 70f\n"
       "69:"  // Height 6: no accumulate
       "mov z24.b, #0x0\n"
@@ -846,16 +846,16 @@
       "71:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 72f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
       "cbnz x10, 73f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -867,59 +867,59 @@
       "b 73f\n"
       "72:"  // Height 6: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "73:"  // Height 6: input setup done
       "cmp x9, #0x4\n"
       "ble 75f\n"
       "74:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1rqw { z1.s }, p0/Z, [x27]\n"
+      "ld1rqw { z5.s }, p0/Z, [x28]\n"
+      "ld1rqw { z4.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x26]\n"
-      "ld1rqw { z3.s }, p0/Z, [x25]\n"
+      "ld1rqw { z3.s }, p0/Z, [x26]\n"
+      "ld1rqw { z2.s }, p0/Z, [x25]\n"
       "cmp x9, #0x4\n"
       "add x28, x28, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x24]\n"
-      "ld1rqw { z5.s }, p0/Z, [x23]\n"
+      "ld1rqw { z1.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
       "add x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z29.s, z8.s, z5.s[0]\n"
+      "ld1w { z19.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z19.s, z5.s[0]\n"
+      "fmla z25.s, z19.s, z4.s[0]\n"
+      "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "fmla z26.s, z19.s, z3.s[0]\n"
+      "fmla z27.s, z19.s, z2.s[0]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "fmla z28.s, z19.s, z1.s[0]\n"
+      "fmla z29.s, z19.s, z0.s[0]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
+      "fmla z24.s, z18.s, z5.s[1]\n"
+      "fmla z25.s, z18.s, z4.s[1]\n"
       "add x23, x23, #0x10\n"
       "addvl x12, x12, #4\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
-      "fmla z29.s, z9.s, z5.s[1]\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z29.s, z10.s, z5.s[2]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
-      "fmla z29.s, z11.s, z5.s[3]\n"
+      "fmla z26.s, z18.s, z3.s[1]\n"
+      "fmla z27.s, z18.s, z2.s[1]\n"
+      "fmla z28.s, z18.s, z1.s[1]\n"
+      "fmla z29.s, z18.s, z0.s[1]\n"
+      "fmla z24.s, z17.s, z5.s[2]\n"
+      "fmla z25.s, z17.s, z4.s[2]\n"
+      "fmla z26.s, z17.s, z3.s[2]\n"
+      "fmla z27.s, z17.s, z2.s[2]\n"
+      "fmla z28.s, z17.s, z1.s[2]\n"
+      "fmla z29.s, z17.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z5.s[3]\n"
+      "fmla z25.s, z16.s, z4.s[3]\n"
+      "fmla z26.s, z16.s, z3.s[3]\n"
+      "fmla z27.s, z16.s, z2.s[3]\n"
+      "fmla z28.s, z16.s, z1.s[3]\n"
+      "fmla z29.s, z16.s, z0.s[3]\n"
       "bgt 74b\n"
       "75:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
@@ -930,43 +930,43 @@
       "ld1rqw { z3.s }, p0/Z, [x25]\n"
       "ld1rqw { z4.s }, p0/Z, [x24]\n"
       "ld1rqw { z5.s }, p0/Z, [x23]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z29.s, z8.s, z5.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
+      "fmla z27.s, z16.s, z3.s[0]\n"
+      "fmla z28.s, z16.s, z4.s[0]\n"
+      "fmla z29.s, z16.s, z5.s[0]\n"
       "ble 76f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z2.s[1]\n"
+      "fmla z27.s, z16.s, z3.s[1]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
-      "fmla z29.s, z9.s, z5.s[1]\n"
+      "fmla z28.s, z16.s, z4.s[1]\n"
+      "fmla z29.s, z16.s, z5.s[1]\n"
       "ble 76f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
+      "fmla z26.s, z16.s, z2.s[2]\n"
+      "fmla z27.s, z16.s, z3.s[2]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z29.s, z10.s, z5.s[2]\n"
+      "fmla z28.s, z16.s, z4.s[2]\n"
+      "fmla z29.s, z16.s, z5.s[2]\n"
       "ble 76f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
-      "fmla z29.s, z11.s, z5.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
+      "fmla z27.s, z16.s, z3.s[3]\n"
+      "fmla z28.s, z16.s, z4.s[3]\n"
+      "fmla z29.s, z16.s, z5.s[3]\n"
       "76:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x10, x10, #0x1\n"
@@ -1028,20 +1028,20 @@
       "b 83f\n"
       "81:"  // Height 7: no bias
       "tbz %x[flags], #0, 82f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x21, x11, x24, LSL #2\n"
+      "add x20, x21, x24, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
-      "ld1w { z26.s }, p1/Z, [x26]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z27.s }, p1/Z, [x25]\n"
-      "ld1w { z28.s }, p1/Z, [x24]\n"
-      "ld1w { z29.s }, p1/Z, [x23]\n"
-      "ld1w { z30.s }, p1/Z, [x22]\n"
+      "add x23, x20, x24, LSL #2\n"
+      "add x22, x23, x24, LSL #2\n"
+      "ld1w { z25.s }, p1/Z, [x21]\n"
+      "ld1w { z26.s }, p1/Z, [x20]\n"
+      "add x21, x22, x24, LSL #2\n"
+      "add x20, x21, x24, LSL #2\n"
+      "ld1w { z27.s }, p1/Z, [x23]\n"
+      "ld1w { z28.s }, p1/Z, [x22]\n"
+      "ld1w { z29.s }, p1/Z, [x21]\n"
+      "ld1w { z30.s }, p1/Z, [x20]\n"
       "b 83f\n"
       "82:"  // Height 7: no accumulate
       "mov z24.b, #0x0\n"
@@ -1056,17 +1056,17 @@
       "84:"  // Height 7: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 85f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
-      "ldr x22, [x21, #0x30]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
       "cbnz x10, 86f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -1079,66 +1079,66 @@
       "b 86f\n"
       "85:"  // Height 7: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "86:"  // Height 7: input setup done
       "cmp x9, #0x4\n"
       "ble 88f\n"
       "87:"  // Height 7: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1rqw { z1.s }, p0/Z, [x27]\n"
+      "ld1rqw { z6.s }, p0/Z, [x28]\n"
+      "ld1rqw { z5.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
       "ld1rqw { z3.s }, p0/Z, [x25]\n"
       "cmp x9, #0x4\n"
       "add x28, x28, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x24]\n"
-      "ld1rqw { z5.s }, p0/Z, [x23]\n"
+      "ld1rqw { z2.s }, p0/Z, [x24]\n"
+      "ld1rqw { z1.s }, p0/Z, [x23]\n"
       "add x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqw { z6.s }, p0/Z, [x22]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z29.s, z8.s, z5.s[0]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "ld1rqw { z0.s }, p0/Z, [x22]\n"
+      "ld1w { z19.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z19.s, z6.s[0]\n"
+      "fmla z25.s, z19.s, z5.s[0]\n"
+      "fmla z26.s, z19.s, z4.s[0]\n"
+      "fmla z27.s, z19.s, z3.s[0]\n"
+      "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "fmla z28.s, z19.s, z2.s[0]\n"
+      "fmla z29.s, z19.s, z1.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "fmla z30.s, z8.s, z6.s[0]\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
+      "fmla z30.s, z19.s, z0.s[0]\n"
+      "fmla z24.s, z18.s, z6.s[1]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
+      "fmla z25.s, z18.s, z5.s[1]\n"
+      "fmla z26.s, z18.s, z4.s[1]\n"
       "add x22, x22, #0x10\n"
       "addvl x12, x12, #4\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
-      "fmla z29.s, z9.s, z5.s[1]\n"
-      "fmla z30.s, z9.s, z6.s[1]\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z29.s, z10.s, z5.s[2]\n"
-      "fmla z30.s, z10.s, z6.s[2]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
-      "fmla z29.s, z11.s, z5.s[3]\n"
-      "fmla z30.s, z11.s, z6.s[3]\n"
+      "fmla z27.s, z18.s, z3.s[1]\n"
+      "fmla z28.s, z18.s, z2.s[1]\n"
+      "fmla z29.s, z18.s, z1.s[1]\n"
+      "fmla z30.s, z18.s, z0.s[1]\n"
+      "fmla z24.s, z17.s, z6.s[2]\n"
+      "fmla z25.s, z17.s, z5.s[2]\n"
+      "fmla z26.s, z17.s, z4.s[2]\n"
+      "fmla z27.s, z17.s, z3.s[2]\n"
+      "fmla z28.s, z17.s, z2.s[2]\n"
+      "fmla z29.s, z17.s, z1.s[2]\n"
+      "fmla z30.s, z17.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z6.s[3]\n"
+      "fmla z25.s, z16.s, z5.s[3]\n"
+      "fmla z26.s, z16.s, z4.s[3]\n"
+      "fmla z27.s, z16.s, z3.s[3]\n"
+      "fmla z28.s, z16.s, z2.s[3]\n"
+      "fmla z29.s, z16.s, z1.s[3]\n"
+      "fmla z30.s, z16.s, z0.s[3]\n"
       "bgt 87b\n"
       "88:"  // Height 7: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
@@ -1150,47 +1150,47 @@
       "ld1rqw { z4.s }, p0/Z, [x24]\n"
       "ld1rqw { z5.s }, p0/Z, [x23]\n"
       "ld1rqw { z6.s }, p0/Z, [x22]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
+      "fmla z27.s, z16.s, z3.s[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z29.s, z8.s, z5.s[0]\n"
-      "fmla z30.s, z8.s, z6.s[0]\n"
+      "fmla z28.s, z16.s, z4.s[0]\n"
+      "fmla z29.s, z16.s, z5.s[0]\n"
+      "fmla z30.s, z16.s, z6.s[0]\n"
       "ble 89f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z2.s[1]\n"
+      "fmla z27.s, z16.s, z3.s[1]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
-      "fmla z29.s, z9.s, z5.s[1]\n"
-      "fmla z30.s, z9.s, z6.s[1]\n"
+      "fmla z28.s, z16.s, z4.s[1]\n"
+      "fmla z29.s, z16.s, z5.s[1]\n"
+      "fmla z30.s, z16.s, z6.s[1]\n"
       "ble 89f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
+      "fmla z26.s, z16.s, z2.s[2]\n"
+      "fmla z27.s, z16.s, z3.s[2]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z29.s, z10.s, z5.s[2]\n"
-      "fmla z30.s, z10.s, z6.s[2]\n"
+      "fmla z28.s, z16.s, z4.s[2]\n"
+      "fmla z29.s, z16.s, z5.s[2]\n"
+      "fmla z30.s, z16.s, z6.s[2]\n"
       "ble 89f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
-      "fmla z29.s, z11.s, z5.s[3]\n"
-      "fmla z30.s, z11.s, z6.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
+      "fmla z27.s, z16.s, z3.s[3]\n"
+      "fmla z28.s, z16.s, z4.s[3]\n"
+      "fmla z29.s, z16.s, z5.s[3]\n"
+      "fmla z30.s, z16.s, z6.s[3]\n"
       "89:"  // Height 7: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x10, x10, #0x1\n"
@@ -1260,22 +1260,22 @@
       "b 96f\n"
       "94:"  // Height 8: no bias
       "tbz %x[flags], #0, 95f\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x27, x11, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
+      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "add x22, x11, x24, LSL #2\n"
+      "add x21, x22, x24, LSL #2\n"
       "ld1w { z24.s }, p1/Z, [x11]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z25.s }, p1/Z, [x27]\n"
-      "ld1w { z26.s }, p1/Z, [x26]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z27.s }, p1/Z, [x25]\n"
-      "ld1w { z28.s }, p1/Z, [x24]\n"
-      "add x21, x22, x20, LSL #2\n"
-      "ld1w { z29.s }, p1/Z, [x23]\n"
-      "ld1w { z30.s }, p1/Z, [x22]\n"
-      "ld1w { z31.s }, p1/Z, [x21]\n"
+      "add x23, x21, x24, LSL #2\n"
+      "add x20, x23, x24, LSL #2\n"
+      "ld1w { z25.s }, p1/Z, [x22]\n"
+      "ld1w { z26.s }, p1/Z, [x21]\n"
+      "add x22, x20, x24, LSL #2\n"
+      "add x21, x22, x24, LSL #2\n"
+      "ld1w { z27.s }, p1/Z, [x23]\n"
+      "ld1w { z28.s }, p1/Z, [x20]\n"
+      "add x20, x21, x24, LSL #2\n"
+      "ld1w { z29.s }, p1/Z, [x22]\n"
+      "ld1w { z30.s }, p1/Z, [x21]\n"
+      "ld1w { z31.s }, p1/Z, [x20]\n"
       "b 96f\n"
       "95:"  // Height 8: no accumulate
       "mov z24.b, #0x0\n"
@@ -1291,18 +1291,18 @@
       "97:"  // Height 8: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w9, [x20, x10, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 98f\n"
-      "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x28, [x21, #0x0]\n"
-      "ldr x27, [x21, #0x8]\n"
-      "ldr x26, [x21, #0x10]\n"
-      "ldr x25, [x21, #0x18]\n"
-      "ldr x24, [x21, #0x20]\n"
-      "ldr x23, [x21, #0x28]\n"
-      "ldr x22, [x21, #0x30]\n"
-      "ldr x21, [x21, #0x38]\n"
+      "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x28, [x20, #0x0]\n"
+      "ldr x27, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x25, [x20, #0x18]\n"
+      "ldr x24, [x20, #0x20]\n"
+      "ldr x23, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "ldr x21, [x20, #0x38]\n"
       "cbnz x10, 99f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x28, x28, x20, LSL #2\n"
@@ -1316,73 +1316,73 @@
       "b 99f\n"
       "98:"  // Height 8: setup direct input
       "mov x28, %x[input_ptr]\n"
-      "add x27, x28, x20, LSL #2\n"
-      "add x26, x27, x20, LSL #2\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x27, x28, x21, LSL #2\n"
+      "add x26, x27, x21, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "99:"  // Height 8: input setup done
       "cmp x9, #0x4\n"
       "ble 101f\n"
       "100:"  // Height 8: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x9\n"
-      "ld1rqw { z0.s }, p0/Z, [x28]\n"
-      "ld1rqw { z1.s }, p0/Z, [x27]\n"
+      "ld1rqw { z7.s }, p0/Z, [x28]\n"
+      "ld1rqw { z6.s }, p0/Z, [x27]\n"
       "sub x9, x9, #0x4\n"
-      "ld1rqw { z2.s }, p0/Z, [x26]\n"
-      "ld1rqw { z3.s }, p0/Z, [x25]\n"
+      "ld1rqw { z5.s }, p0/Z, [x26]\n"
+      "ld1rqw { z4.s }, p0/Z, [x25]\n"
       "cmp x9, #0x4\n"
       "add x28, x28, #0x10\n"
-      "ld1rqw { z4.s }, p0/Z, [x24]\n"
-      "ld1rqw { z5.s }, p0/Z, [x23]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "ld1rqw { z2.s }, p0/Z, [x23]\n"
       "add x27, x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqw { z6.s }, p0/Z, [x22]\n"
-      "ld1rqw { z7.s }, p0/Z, [x21]\n"
+      "ld1rqw { z1.s }, p0/Z, [x22]\n"
+      "ld1rqw { z0.s }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
-      "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z29.s, z8.s, z5.s[0]\n"
+      "ld1w { z19.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z19.s, z7.s[0]\n"
+      "fmla z25.s, z19.s, z6.s[0]\n"
+      "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n"
+      "fmla z26.s, z19.s, z5.s[0]\n"
+      "fmla z27.s, z19.s, z4.s[0]\n"
+      "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n"
+      "fmla z28.s, z19.s, z3.s[0]\n"
+      "fmla z29.s, z19.s, z2.s[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "fmla z30.s, z8.s, z6.s[0]\n"
-      "fmla z31.s, z8.s, z7.s[0]\n"
+      "fmla z30.s, z19.s, z1.s[0]\n"
+      "fmla z31.s, z19.s, z0.s[0]\n"
       "add x21, x21, #0x10\n"
       "addvl x12, x12, #4\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
-      "fmla z29.s, z9.s, z5.s[1]\n"
-      "fmla z30.s, z9.s, z6.s[1]\n"
-      "fmla z31.s, z9.s, z7.s[1]\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z29.s, z10.s, z5.s[2]\n"
-      "fmla z30.s, z10.s, z6.s[2]\n"
-      "fmla z31.s, z10.s, z7.s[2]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
-      "fmla z29.s, z11.s, z5.s[3]\n"
-      "fmla z30.s, z11.s, z6.s[3]\n"
-      "fmla z31.s, z11.s, z7.s[3]\n"
+      "fmla z24.s, z18.s, z7.s[1]\n"
+      "fmla z25.s, z18.s, z6.s[1]\n"
+      "fmla z26.s, z18.s, z5.s[1]\n"
+      "fmla z27.s, z18.s, z4.s[1]\n"
+      "fmla z28.s, z18.s, z3.s[1]\n"
+      "fmla z29.s, z18.s, z2.s[1]\n"
+      "fmla z30.s, z18.s, z1.s[1]\n"
+      "fmla z31.s, z18.s, z0.s[1]\n"
+      "fmla z24.s, z17.s, z7.s[2]\n"
+      "fmla z25.s, z17.s, z6.s[2]\n"
+      "fmla z26.s, z17.s, z5.s[2]\n"
+      "fmla z27.s, z17.s, z4.s[2]\n"
+      "fmla z28.s, z17.s, z3.s[2]\n"
+      "fmla z29.s, z17.s, z2.s[2]\n"
+      "fmla z30.s, z17.s, z1.s[2]\n"
+      "fmla z31.s, z17.s, z0.s[2]\n"
+      "fmla z24.s, z16.s, z7.s[3]\n"
+      "fmla z25.s, z16.s, z6.s[3]\n"
+      "fmla z26.s, z16.s, z5.s[3]\n"
+      "fmla z27.s, z16.s, z4.s[3]\n"
+      "fmla z28.s, z16.s, z3.s[3]\n"
+      "fmla z29.s, z16.s, z2.s[3]\n"
+      "fmla z30.s, z16.s, z1.s[3]\n"
+      "fmla z31.s, z16.s, z0.s[3]\n"
       "bgt 100b\n"
       "101:"  // Height 8: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x9\n"
@@ -1395,51 +1395,51 @@
       "ld1rqw { z5.s }, p0/Z, [x23]\n"
       "ld1rqw { z6.s }, p0/Z, [x22]\n"
       "ld1rqw { z7.s }, p0/Z, [x21]\n"
-      "ld1w { z8.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z8.s, z0.s[0]\n"
-      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[0]\n"
+      "fmla z25.s, z16.s, z1.s[0]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z8.s, z2.s[0]\n"
-      "fmla z27.s, z8.s, z3.s[0]\n"
-      "fmla z28.s, z8.s, z4.s[0]\n"
-      "fmla z29.s, z8.s, z5.s[0]\n"
-      "fmla z30.s, z8.s, z6.s[0]\n"
-      "fmla z31.s, z8.s, z7.s[0]\n"
+      "fmla z26.s, z16.s, z2.s[0]\n"
+      "fmla z27.s, z16.s, z3.s[0]\n"
+      "fmla z28.s, z16.s, z4.s[0]\n"
+      "fmla z29.s, z16.s, z5.s[0]\n"
+      "fmla z30.s, z16.s, z6.s[0]\n"
+      "fmla z31.s, z16.s, z7.s[0]\n"
       "ble 102f\n"
-      "ld1w { z9.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z9.s, z0.s[1]\n"
-      "fmla z25.s, z9.s, z1.s[1]\n"
-      "fmla z26.s, z9.s, z2.s[1]\n"
-      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "fmla z25.s, z16.s, z1.s[1]\n"
+      "fmla z26.s, z16.s, z2.s[1]\n"
+      "fmla z27.s, z16.s, z3.s[1]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z9.s, z4.s[1]\n"
-      "fmla z29.s, z9.s, z5.s[1]\n"
-      "fmla z30.s, z9.s, z6.s[1]\n"
-      "fmla z31.s, z9.s, z7.s[1]\n"
+      "fmla z28.s, z16.s, z4.s[1]\n"
+      "fmla z29.s, z16.s, z5.s[1]\n"
+      "fmla z30.s, z16.s, z6.s[1]\n"
+      "fmla z31.s, z16.s, z7.s[1]\n"
       "ble 102f\n"
-      "ld1w { z10.s }, p2/Z, [x12]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
       "subs x9, x9, #0x1\n"
-      "fmla z24.s, z10.s, z0.s[2]\n"
-      "fmla z25.s, z10.s, z1.s[2]\n"
-      "fmla z26.s, z10.s, z2.s[2]\n"
-      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z24.s, z16.s, z0.s[2]\n"
+      "fmla z25.s, z16.s, z1.s[2]\n"
+      "fmla z26.s, z16.s, z2.s[2]\n"
+      "fmla z27.s, z16.s, z3.s[2]\n"
       "addvl x12, x12, #1\n"
-      "fmla z28.s, z10.s, z4.s[2]\n"
-      "fmla z29.s, z10.s, z5.s[2]\n"
-      "fmla z30.s, z10.s, z6.s[2]\n"
-      "fmla z31.s, z10.s, z7.s[2]\n"
+      "fmla z28.s, z16.s, z4.s[2]\n"
+      "fmla z29.s, z16.s, z5.s[2]\n"
+      "fmla z30.s, z16.s, z6.s[2]\n"
+      "fmla z31.s, z16.s, z7.s[2]\n"
       "ble 102f\n"
-      "ld1w { z11.s }, p2/Z, [x12]\n"
-      "fmla z24.s, z11.s, z0.s[3]\n"
-      "fmla z25.s, z11.s, z1.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [x12]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "fmla z25.s, z16.s, z1.s[3]\n"
       "addvl x12, x12, #1\n"
-      "fmla z26.s, z11.s, z2.s[3]\n"
-      "fmla z27.s, z11.s, z3.s[3]\n"
-      "fmla z28.s, z11.s, z4.s[3]\n"
-      "fmla z29.s, z11.s, z5.s[3]\n"
-      "fmla z30.s, z11.s, z6.s[3]\n"
-      "fmla z31.s, z11.s, z7.s[3]\n"
+      "fmla z26.s, z16.s, z2.s[3]\n"
+      "fmla z27.s, z16.s, z3.s[3]\n"
+      "fmla z28.s, z16.s, z4.s[3]\n"
+      "fmla z29.s, z16.s, z5.s[3]\n"
+      "fmla z30.s, z16.s, z6.s[3]\n"
+      "fmla z31.s, z16.s, z7.s[3]\n"
       "102:"  // Height 8: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x10, x10, #0x1\n"
@@ -1500,12 +1500,11 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "106:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
-      : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
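
(Aside for reviewers unfamiliar with these generated kernels: the register renumbering in the hunks above does not alter the micro-kernel's structure. Each K step still broadcasts one lane of a replicated 128-bit LHS quad and multiply-accumulates it against one vector of the RHS panel, exactly as "fmla z24.s, z16.s, z4.s[0]" does. The sketch below is purely illustrative and uses hypothetical names (a_ptr, b_ptr, acc); it shows the same broadcast-by-lane FMA pattern in SVE ACLE intrinsics, assuming the code is built with SVE enabled.)

    #include <arm_sve.h>

    // Illustrative only: one accumulator row of the FP32 hybrid micro-kernel,
    // processing four K steps, written with SVE ACLE intrinsics.
    static inline svfloat32_t microkernel_row(const float *a_ptr, // 4 packed LHS values for this row
                                              const float *b_ptr, // RHS panel, one vector per K step
                                              svfloat32_t acc,    // running accumulator (z24..z31 in the asm)
                                              svbool_t pg)        // governing predicate
    {
        // ld1rqw: load a 128-bit quad of LHS values and replicate it across the vector.
        svfloat32_t a = svld1rq_f32(pg, a_ptr);
        // One indexed fmla per K step, selecting lane 0..3 of the replicated quad.
        acc = svmla_lane_f32(acc, svld1_f32(pg, b_ptr + 0 * svcntw()), a, 0);
        acc = svmla_lane_f32(acc, svld1_f32(pg, b_ptr + 1 * svcntw()), a, 1);
        acc = svmla_lane_f32(acc, svld1_f32(pg, b_ptr + 2 * svcntw()), a, 2);
        acc = svmla_lane_f32(acc, svld1_f32(pg, b_ptr + 3 * svcntw()), a, 3);
        return acc;
    }
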
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index ab175a3..66c106d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, float>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -100,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
index 8d05c1f..2b2a068 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -140,22 +140,22 @@
       "b 5f\n"
       "3:"  // Height 1: no bias
       "tbz %x[flags], #0, 4f\n"
-      "ld1w { z9.s }, p6/Z, [x27]\n"
-      "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+      "ld1w { z21.s }, p6/Z, [x27]\n"
+      "ld1w { z20.s }, p5/Z, [x27, #1, MUL VL]\n"
+      "zip1 z8.d, z21.d, z14.d\n"
+      "zip2 z14.d, z21.d, z14.d\n"
+      "ld1w { z23.s }, p4/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+      "zip1 z9.d, z20.d, z15.d\n"
+      "zip2 z15.d, z20.d, z15.d\n"
+      "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
+      "zip1 z10.d, z23.d, z16.d\n"
+      "zip2 z16.d, z23.d, z16.d\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "zip1 z12.d, z21.d, z18.d\n"
+      "zip2 z18.d, z21.d, z18.d\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
       "b 5f\n"
@@ -177,11 +177,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -193,69 +193,69 @@
       "ble 10f\n"
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      "ld1rqw { z24.s }, p0/Z, [x24]\n"
+      ".inst 0x658abf18  // bfcvt z24.h, p7/M, z24.s\n"
+      "uzp1 z24.h, z24.h, z24.h\n"
+      "ld1h { z21.h }, p7/Z, [x28]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6475e708  // bfmmla z8.s, z24.h, z21.h\n"
+      ".inst 0x6474e70e  // bfmmla z14.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x6475e709  // bfmmla z9.s, z24.h, z21.h\n"
+      ".inst 0x6474e70f  // bfmmla z15.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6475e70a  // bfmmla z10.s, z24.h, z21.h\n"
+      ".inst 0x6474e710  // bfmmla z16.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x6475e70b  // bfmmla z11.s, z24.h, z21.h\n"
+      ".inst 0x6474e711  // bfmmla z17.s, z24.h, z20.h\n"
+      "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
       "sub x25, x25, #0x4\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
+      ".inst 0x6477e70c  // bfmmla z12.s, z24.h, z23.h\n"
+      ".inst 0x6476e712  // bfmmla z18.s, z24.h, z22.h\n"
+      ".inst 0x6475e70d  // bfmmla z13.s, z24.h, z21.h\n"
       "add x24, x24, #0x10\n"
       "addvl x28, x28, #-4\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      ".inst 0x6474e713  // bfmmla z19.s, z24.h, z20.h\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      "ld1rqw { z23.s }, p0/Z, [x24]\n"
+      ".inst 0x658abef7  // bfcvt z23.h, p7/M, z23.s\n"
+      "uzp1 z23.h, z23.h, z23.h\n"
+      "ld1h { z21.h }, p7/Z, [x28]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x6475e6e8  // bfmmla z8.s, z23.h, z21.h\n"
+      ".inst 0x6474e6ee  // bfmmla z14.s, z23.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x6475e6e9  // bfmmla z9.s, z23.h, z21.h\n"
+      ".inst 0x6474e6ef  // bfmmla z15.s, z23.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6475e6ea  // bfmmla z10.s, z23.h, z21.h\n"
+      ".inst 0x6474e6f0  // bfmmla z16.s, z23.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      ".inst 0x6475e6eb  // bfmmla z11.s, z23.h, z21.h\n"
+      ".inst 0x6474e6f1  // bfmmla z17.s, z23.h, z20.h\n"
+      "ld1h { z20.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x6474e6ec  // bfmmla z12.s, z23.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6476e6f2  // bfmmla z18.s, z23.h, z22.h\n"
+      ".inst 0x6475e6ed  // bfmmla z13.s, z23.h, z21.h\n"
+      ".inst 0x6474e6f3  // bfmmla z19.s, z23.h, z20.h\n"
       "addvl x28, x28, #-4\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -270,21 +270,21 @@
       "uzp1 z13.d, z13.d, z19.d\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p7/Z, [x20]\n"
+      "ld1rw { z21.s }, p7/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p7/Z, [x20]\n"
-      "fmin z8.s, p7/M, z8.s, z1.s\n"
-      "fmin z9.s, p7/M, z9.s, z1.s\n"
-      "fmin z10.s, p7/M, z10.s, z1.s\n"
-      "fmin z11.s, p7/M, z11.s, z1.s\n"
-      "fmin z12.s, p7/M, z12.s, z1.s\n"
-      "fmin z13.s, p7/M, z13.s, z1.s\n"
-      "fmax z8.s, p7/M, z8.s, z0.s\n"
-      "fmax z9.s, p7/M, z9.s, z0.s\n"
-      "fmax z10.s, p7/M, z10.s, z0.s\n"
-      "fmax z11.s, p7/M, z11.s, z0.s\n"
-      "fmax z12.s, p7/M, z12.s, z0.s\n"
-      "fmax z13.s, p7/M, z13.s, z0.s\n"
+      "ld1rw { z20.s }, p7/Z, [x20]\n"
+      "fmin z8.s, p7/M, z8.s, z21.s\n"
+      "fmin z9.s, p7/M, z9.s, z21.s\n"
+      "fmin z10.s, p7/M, z10.s, z21.s\n"
+      "fmin z11.s, p7/M, z11.s, z21.s\n"
+      "fmin z12.s, p7/M, z12.s, z21.s\n"
+      "fmin z13.s, p7/M, z13.s, z21.s\n"
+      "fmax z8.s, p7/M, z8.s, z20.s\n"
+      "fmax z9.s, p7/M, z9.s, z20.s\n"
+      "fmax z10.s, p7/M, z10.s, z20.s\n"
+      "fmax z11.s, p7/M, z11.s, z20.s\n"
+      "fmax z12.s, p7/M, z12.s, z20.s\n"
+      "fmax z13.s, p7/M, z13.s, z20.s\n"
       "12:"  // Height 1: No activation
       "st1w { z8.s }, p6, [x27]\n"
       "st1w { z9.s }, p5, [x27, #1, MUL VL]\n"
@@ -340,29 +340,29 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x23, x27, x20, LSL #2\n"
-      "ld1w { z9.s }, p6/Z, [x27]\n"
-      "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
-      "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
-      "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+      "add x20, x27, x20, LSL #2\n"
+      "ld1w { z16.s }, p6/Z, [x27]\n"
+      "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
-      "ld1w { z14.s }, p6/Z, [x23]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
+      "ld1w { z14.s }, p6/Z, [x20]\n"
+      "zip1 z8.d, z16.d, z14.d\n"
+      "zip2 z14.d, z16.d, z14.d\n"
+      "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z15.d\n"
+      "zip2 z15.d, z17.d, z15.d\n"
+      "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n"
+      "zip1 z10.d, z19.d, z16.d\n"
+      "zip2 z16.d, z19.d, z16.d\n"
+      "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "zip1 z12.d, z21.d, z18.d\n"
+      "zip2 z18.d, z21.d, z18.d\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
       "b 18f\n"
@@ -384,12 +384,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -397,85 +397,85 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "21:"  // Height 2: input setup done
       "cmp x25, #0x4\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      "ld1rqw { z24.s }, p0/Z, [x24]\n"
+      "ld1rqw { z20.s }, p0/Z, [x23]\n"
+      ".inst 0x658abf18  // bfcvt z24.h, p7/M, z24.s\n"
+      ".inst 0x658abe94  // bfcvt z20.h, p7/M, z20.s\n"
+      "uzp1 z24.h, z24.h, z24.h\n"
+      "ld1h { z23.h }, p7/Z, [x28]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "uzp1 z20.h, z20.h, z20.h\n"
+      "trn1 z24.d, z24.d, z20.d\n"
+      "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x6477e708  // bfmmla z8.s, z24.h, z23.h\n"
+      ".inst 0x6476e70e  // bfmmla z14.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6475e709  // bfmmla z9.s, z24.h, z21.h\n"
+      ".inst 0x6474e70f  // bfmmla z15.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6477e70a  // bfmmla z10.s, z24.h, z23.h\n"
+      ".inst 0x6476e710  // bfmmla z16.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x6475e70b  // bfmmla z11.s, z24.h, z21.h\n"
+      ".inst 0x6474e711  // bfmmla z17.s, z24.h, z20.h\n"
+      "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
       "sub x25, x25, #0x4\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
+      ".inst 0x6477e70c  // bfmmla z12.s, z24.h, z23.h\n"
+      ".inst 0x6476e712  // bfmmla z18.s, z24.h, z22.h\n"
+      ".inst 0x6475e70d  // bfmmla z13.s, z24.h, z21.h\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      ".inst 0x6474e713  // bfmmla z19.s, z24.h, z20.h\n"
       "addvl x28, x28, #-4\n"
       "bgt 22b\n"
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      "ld1rqw { z24.s }, p0/Z, [x24]\n"
+      "ld1rqw { z20.s }, p0/Z, [x23]\n"
+      ".inst 0x658abf18  // bfcvt z24.h, p7/M, z24.s\n"
+      ".inst 0x658abe94  // bfcvt z20.h, p7/M, z20.s\n"
+      "uzp1 z24.h, z24.h, z24.h\n"
+      "ld1h { z23.h }, p7/Z, [x28]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "uzp1 z20.h, z20.h, z20.h\n"
+      "trn1 z24.d, z24.d, z20.d\n"
+      "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x6477e708  // bfmmla z8.s, z24.h, z23.h\n"
+      ".inst 0x6476e70e  // bfmmla z14.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n"
+      "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6475e709  // bfmmla z9.s, z24.h, z21.h\n"
+      ".inst 0x6474e70f  // bfmmla z15.s, z24.h, z20.h\n"
+      "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
+      ".inst 0x6477e70a  // bfmmla z10.s, z24.h, z23.h\n"
+      ".inst 0x6476e710  // bfmmla z16.s, z24.h, z22.h\n"
+      "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x6475e70b  // bfmmla z11.s, z24.h, z21.h\n"
+      ".inst 0x6474e711  // bfmmla z17.s, z24.h, z20.h\n"
+      "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6477e70c  // bfmmla z12.s, z24.h, z23.h\n"
+      ".inst 0x6476e712  // bfmmla z18.s, z24.h, z22.h\n"
       "addvl x28, x28, #-4\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
+      ".inst 0x6475e70d  // bfmmla z13.s, z24.h, z21.h\n"
+      ".inst 0x6474e713  // bfmmla z19.s, z24.h, z20.h\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -497,33 +497,33 @@
       "uzp2 z13.d, z13.d, z19.d\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p7/Z, [x20]\n"
+      "ld1rw { z20.s }, p7/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p7/Z, [x20]\n"
-      "fmin z4.s, p7/M, z4.s, z1.s\n"
-      "fmin z14.s, p7/M, z14.s, z1.s\n"
-      "fmin z15.s, p7/M, z15.s, z1.s\n"
-      "fmin z16.s, p7/M, z16.s, z1.s\n"
-      "fmin z17.s, p7/M, z17.s, z1.s\n"
-      "fmin z18.s, p7/M, z18.s, z1.s\n"
-      "fmin z8.s, p7/M, z8.s, z1.s\n"
-      "fmin z9.s, p7/M, z9.s, z1.s\n"
-      "fmin z10.s, p7/M, z10.s, z1.s\n"
-      "fmin z11.s, p7/M, z11.s, z1.s\n"
-      "fmin z12.s, p7/M, z12.s, z1.s\n"
-      "fmin z13.s, p7/M, z13.s, z1.s\n"
-      "fmax z4.s, p7/M, z4.s, z0.s\n"
-      "fmax z14.s, p7/M, z14.s, z0.s\n"
-      "fmax z15.s, p7/M, z15.s, z0.s\n"
-      "fmax z16.s, p7/M, z16.s, z0.s\n"
-      "fmax z17.s, p7/M, z17.s, z0.s\n"
-      "fmax z18.s, p7/M, z18.s, z0.s\n"
-      "fmax z8.s, p7/M, z8.s, z0.s\n"
-      "fmax z9.s, p7/M, z9.s, z0.s\n"
-      "fmax z10.s, p7/M, z10.s, z0.s\n"
-      "fmax z11.s, p7/M, z11.s, z0.s\n"
-      "fmax z12.s, p7/M, z12.s, z0.s\n"
-      "fmax z13.s, p7/M, z13.s, z0.s\n"
+      "ld1rw { z19.s }, p7/Z, [x20]\n"
+      "fmin z4.s, p7/M, z4.s, z20.s\n"
+      "fmin z14.s, p7/M, z14.s, z20.s\n"
+      "fmin z15.s, p7/M, z15.s, z20.s\n"
+      "fmin z16.s, p7/M, z16.s, z20.s\n"
+      "fmin z17.s, p7/M, z17.s, z20.s\n"
+      "fmin z18.s, p7/M, z18.s, z20.s\n"
+      "fmin z8.s, p7/M, z8.s, z20.s\n"
+      "fmin z9.s, p7/M, z9.s, z20.s\n"
+      "fmin z10.s, p7/M, z10.s, z20.s\n"
+      "fmin z11.s, p7/M, z11.s, z20.s\n"
+      "fmin z12.s, p7/M, z12.s, z20.s\n"
+      "fmin z13.s, p7/M, z13.s, z20.s\n"
+      "fmax z4.s, p7/M, z4.s, z19.s\n"
+      "fmax z14.s, p7/M, z14.s, z19.s\n"
+      "fmax z15.s, p7/M, z15.s, z19.s\n"
+      "fmax z16.s, p7/M, z16.s, z19.s\n"
+      "fmax z17.s, p7/M, z17.s, z19.s\n"
+      "fmax z18.s, p7/M, z18.s, z19.s\n"
+      "fmax z8.s, p7/M, z8.s, z19.s\n"
+      "fmax z9.s, p7/M, z9.s, z19.s\n"
+      "fmax z10.s, p7/M, z10.s, z19.s\n"
+      "fmax z11.s, p7/M, z11.s, z19.s\n"
+      "fmax z12.s, p7/M, z12.s, z19.s\n"
+      "fmax z13.s, p7/M, z13.s, z19.s\n"
       "25:"  // Height 2: No activation
       "st1w { z4.s }, p6, [x27]\n"
       "st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
@@ -597,38 +597,38 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x23, x27, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z9.s }, p6/Z, [x27]\n"
-      "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
-      "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
-      "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+      "add x21, x27, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z16.s }, p6/Z, [x27]\n"
+      "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
-      "ld1w { z14.s }, p6/Z, [x23]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
-      "ld1w { z21.s }, p6/Z, [x22]\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
-      "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z14.s }, p6/Z, [x21]\n"
+      "zip1 z8.d, z16.d, z14.d\n"
+      "zip2 z14.d, z16.d, z14.d\n"
+      "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z15.d\n"
+      "zip2 z15.d, z17.d, z15.d\n"
+      "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n"
+      "zip1 z10.d, z19.d, z16.d\n"
+      "zip2 z16.d, z19.d, z16.d\n"
+      "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n"
+      "ld1w { z21.s }, p6/Z, [x20]\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n"
+      "zip1 z12.d, z24.d, z18.d\n"
+      "zip2 z18.d, z24.d, z18.d\n"
+      "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
-      "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n"
       "zip1 z20.d, z21.d, z26.d\n"
       "zip2 z26.d, z21.d, z26.d\n"
       "zip1 z21.d, z22.d, z27.d\n"
@@ -639,8 +639,8 @@
       "zip2 z29.d, z24.d, z29.d\n"
       "zip1 z24.d, z25.d, z30.d\n"
       "zip2 z30.d, z25.d, z30.d\n"
-      "zip1 z25.d, z4.d, z31.d\n"
-      "zip2 z31.d, z4.d, z31.d\n"
+      "zip1 z25.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 31f\n"
       "30:"  // Height 3: no accumulate
       "mov z8.b, #0x0\n"
@@ -672,13 +672,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -687,117 +687,117 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "34:"  // Height 3: input setup done
       "cmp x25, #0x4\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
       ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
       "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "trn1 z5.d, z5.d, z0.d\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x6463e4a8  // bfmmla z8.s, z5.h, z3.h\n"
+      ".inst 0x6463e494  // bfmmla z20.s, z4.h, z3.h\n"
+      ".inst 0x6462e4ae  // bfmmla z14.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
       "sub x25, x25, #0x4\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      ".inst 0x6461e4a9  // bfmmla z9.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x6461e495  // bfmmla z21.s, z4.h, z1.h\n"
+      ".inst 0x6460e4af  // bfmmla z15.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x6460e49b  // bfmmla z27.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x6463e4aa  // bfmmla z10.s, z5.h, z3.h\n"
+      ".inst 0x6463e496  // bfmmla z22.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b0  // bfmmla z16.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x6462e49c  // bfmmla z28.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ab  // bfmmla z11.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6461e497  // bfmmla z23.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b1  // bfmmla z17.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6463e4ac  // bfmmla z12.s, z5.h, z3.h\n"
       "addvl x28, x28, #-4\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6463e498  // bfmmla z24.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b2  // bfmmla z18.s, z5.h, z2.h\n"
+      ".inst 0x6462e49e  // bfmmla z30.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ad  // bfmmla z13.s, z5.h, z1.h\n"
+      ".inst 0x6461e499  // bfmmla z25.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b3  // bfmmla z19.s, z5.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z0.s }, p0/Z, [x23]\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
       ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
       "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "trn1 z5.d, z5.d, z0.d\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x6463e4a8  // bfmmla z8.s, z5.h, z3.h\n"
+      ".inst 0x6463e494  // bfmmla z20.s, z4.h, z3.h\n"
+      ".inst 0x6462e4ae  // bfmmla z14.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x6462e49a  // bfmmla z26.s, z4.h, z2.h\n"
+      ".inst 0x6461e4a9  // bfmmla z9.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6461e495  // bfmmla z21.s, z4.h, z1.h\n"
+      ".inst 0x6460e4af  // bfmmla z15.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x6460e49b  // bfmmla z27.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6463e4aa  // bfmmla z10.s, z5.h, z3.h\n"
+      ".inst 0x6463e496  // bfmmla z22.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b0  // bfmmla z16.s, z5.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x6462e49c  // bfmmla z28.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ab  // bfmmla z11.s, z5.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x6461e497  // bfmmla z23.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b1  // bfmmla z17.s, z5.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6463e4ac  // bfmmla z12.s, z5.h, z3.h\n"
       "addvl x28, x28, #-4\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6463e498  // bfmmla z24.s, z4.h, z3.h\n"
+      ".inst 0x6462e4b2  // bfmmla z18.s, z5.h, z2.h\n"
+      ".inst 0x6462e49e  // bfmmla z30.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ad  // bfmmla z13.s, z5.h, z1.h\n"
+      ".inst 0x6461e499  // bfmmla z25.s, z4.h, z1.h\n"
+      ".inst 0x6460e4b3  // bfmmla z19.s, z5.h, z0.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -826,45 +826,45 @@
       "uzp1 z25.d, z25.d, z31.d\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p7/Z, [x20]\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
       "ld1rw { z0.s }, p7/Z, [x20]\n"
-      "fmin z4.s, p7/M, z4.s, z1.s\n"
-      "fmin z14.s, p7/M, z14.s, z1.s\n"
-      "fmin z15.s, p7/M, z15.s, z1.s\n"
-      "fmin z16.s, p7/M, z16.s, z1.s\n"
-      "fmin z17.s, p7/M, z17.s, z1.s\n"
-      "fmin z18.s, p7/M, z18.s, z1.s\n"
-      "fmin z8.s, p7/M, z8.s, z1.s\n"
-      "fmin z9.s, p7/M, z9.s, z1.s\n"
-      "fmin z10.s, p7/M, z10.s, z1.s\n"
-      "fmin z11.s, p7/M, z11.s, z1.s\n"
-      "fmin z12.s, p7/M, z12.s, z1.s\n"
-      "fmin z13.s, p7/M, z13.s, z1.s\n"
-      "fmin z20.s, p7/M, z20.s, z1.s\n"
-      "fmin z21.s, p7/M, z21.s, z1.s\n"
-      "fmin z22.s, p7/M, z22.s, z1.s\n"
-      "fmin z23.s, p7/M, z23.s, z1.s\n"
-      "fmin z24.s, p7/M, z24.s, z1.s\n"
-      "fmin z25.s, p7/M, z25.s, z1.s\n"
-      "fmax z4.s, p7/M, z4.s, z0.s\n"
-      "fmax z14.s, p7/M, z14.s, z0.s\n"
-      "fmax z15.s, p7/M, z15.s, z0.s\n"
-      "fmax z16.s, p7/M, z16.s, z0.s\n"
-      "fmax z17.s, p7/M, z17.s, z0.s\n"
-      "fmax z18.s, p7/M, z18.s, z0.s\n"
-      "fmax z8.s, p7/M, z8.s, z0.s\n"
-      "fmax z9.s, p7/M, z9.s, z0.s\n"
-      "fmax z10.s, p7/M, z10.s, z0.s\n"
-      "fmax z11.s, p7/M, z11.s, z0.s\n"
-      "fmax z12.s, p7/M, z12.s, z0.s\n"
-      "fmax z13.s, p7/M, z13.s, z0.s\n"
-      "fmax z20.s, p7/M, z20.s, z0.s\n"
-      "fmax z21.s, p7/M, z21.s, z0.s\n"
-      "fmax z22.s, p7/M, z22.s, z0.s\n"
-      "fmax z23.s, p7/M, z23.s, z0.s\n"
-      "fmax z24.s, p7/M, z24.s, z0.s\n"
-      "fmax z25.s, p7/M, z25.s, z0.s\n"
+      "add x20, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z19.s }, p7/Z, [x20]\n"
+      "fmin z4.s, p7/M, z4.s, z0.s\n"
+      "fmin z14.s, p7/M, z14.s, z0.s\n"
+      "fmin z15.s, p7/M, z15.s, z0.s\n"
+      "fmin z16.s, p7/M, z16.s, z0.s\n"
+      "fmin z17.s, p7/M, z17.s, z0.s\n"
+      "fmin z18.s, p7/M, z18.s, z0.s\n"
+      "fmin z8.s, p7/M, z8.s, z0.s\n"
+      "fmin z9.s, p7/M, z9.s, z0.s\n"
+      "fmin z10.s, p7/M, z10.s, z0.s\n"
+      "fmin z11.s, p7/M, z11.s, z0.s\n"
+      "fmin z12.s, p7/M, z12.s, z0.s\n"
+      "fmin z13.s, p7/M, z13.s, z0.s\n"
+      "fmin z20.s, p7/M, z20.s, z0.s\n"
+      "fmin z21.s, p7/M, z21.s, z0.s\n"
+      "fmin z22.s, p7/M, z22.s, z0.s\n"
+      "fmin z23.s, p7/M, z23.s, z0.s\n"
+      "fmin z24.s, p7/M, z24.s, z0.s\n"
+      "fmin z25.s, p7/M, z25.s, z0.s\n"
+      "fmax z4.s, p7/M, z4.s, z19.s\n"
+      "fmax z14.s, p7/M, z14.s, z19.s\n"
+      "fmax z15.s, p7/M, z15.s, z19.s\n"
+      "fmax z16.s, p7/M, z16.s, z19.s\n"
+      "fmax z17.s, p7/M, z17.s, z19.s\n"
+      "fmax z18.s, p7/M, z18.s, z19.s\n"
+      "fmax z8.s, p7/M, z8.s, z19.s\n"
+      "fmax z9.s, p7/M, z9.s, z19.s\n"
+      "fmax z10.s, p7/M, z10.s, z19.s\n"
+      "fmax z11.s, p7/M, z11.s, z19.s\n"
+      "fmax z12.s, p7/M, z12.s, z19.s\n"
+      "fmax z13.s, p7/M, z13.s, z19.s\n"
+      "fmax z20.s, p7/M, z20.s, z19.s\n"
+      "fmax z21.s, p7/M, z21.s, z19.s\n"
+      "fmax z22.s, p7/M, z22.s, z19.s\n"
+      "fmax z23.s, p7/M, z23.s, z19.s\n"
+      "fmax z24.s, p7/M, z24.s, z19.s\n"
+      "fmax z25.s, p7/M, z25.s, z19.s\n"
       "38:"  // Height 3: No activation
       "st1w { z4.s }, p6, [x27]\n"
       "st1w { z14.s }, p5, [x27, #1, MUL VL]\n"
@@ -947,57 +947,57 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x23, x27, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z9.s }, p6/Z, [x27]\n"
+      "add x22, x27, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
-      "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n"
-      "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n"
-      "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n"
+      "ld1w { z16.s }, p6/Z, [x27]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n"
       "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n"
-      "ld1w { z14.s }, p6/Z, [x23]\n"
-      "zip1 z8.d, z9.d, z14.d\n"
-      "zip2 z14.d, z9.d, z14.d\n"
-      "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z15.d\n"
-      "zip2 z15.d, z10.d, z15.d\n"
-      "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n"
-      "zip1 z10.d, z11.d, z16.d\n"
-      "zip2 z16.d, z11.d, z16.d\n"
-      "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n"
-      "ld1w { z21.s }, p6/Z, [x22]\n"
-      "zip1 z11.d, z12.d, z17.d\n"
-      "zip2 z17.d, z12.d, z17.d\n"
-      "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n"
-      "zip1 z12.d, z13.d, z18.d\n"
-      "zip2 z18.d, z13.d, z18.d\n"
-      "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z14.s }, p6/Z, [x22]\n"
+      "zip1 z8.d, z16.d, z14.d\n"
+      "zip2 z14.d, z16.d, z14.d\n"
+      "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z15.d\n"
+      "zip2 z15.d, z17.d, z15.d\n"
+      "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "zip1 z10.d, z19.d, z16.d\n"
+      "zip2 z16.d, z19.d, z16.d\n"
+      "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z21.s }, p6/Z, [x21]\n"
+      "zip1 z11.d, z22.d, z17.d\n"
+      "zip2 z17.d, z22.d, z17.d\n"
+      "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+      "zip1 z12.d, z24.d, z18.d\n"
+      "zip2 z18.d, z24.d, z18.d\n"
+      "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
       "zip1 z13.d, z20.d, z19.d\n"
       "zip2 z19.d, z20.d, z19.d\n"
-      "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n"
-      "ld1w { z26.s }, p6/Z, [x21]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n"
+      "ld1w { z26.s }, p6/Z, [x20]\n"
       "zip1 z20.d, z21.d, z26.d\n"
       "zip2 z26.d, z21.d, z26.d\n"
-      "ld1w { z27.s }, p5/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
       "zip1 z21.d, z22.d, z27.d\n"
       "zip2 z27.d, z22.d, z27.d\n"
-      "ld1w { z29.s }, p3/Z, [x21, #3, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #4, MUL VL]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
       "zip1 z22.d, z23.d, z28.d\n"
       "zip2 z28.d, z23.d, z28.d\n"
-      "ld1w { z31.s }, p1/Z, [x21, #5, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
       "zip1 z23.d, z24.d, z29.d\n"
       "zip2 z29.d, z24.d, z29.d\n"
       "zip1 z24.d, z25.d, z30.d\n"
       "zip2 z30.d, z25.d, z30.d\n"
-      "zip1 z25.d, z4.d, z31.d\n"
-      "zip2 z31.d, z4.d, z31.d\n"
+      "zip1 z25.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 44f\n"
       "43:"  // Height 4: no accumulate
       "mov z8.b, #0x0\n"
@@ -1029,14 +1029,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20, LSL #2\n"
@@ -1046,127 +1046,127 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "47:"  // Height 4: input setup done
       "cmp x25, #0x4\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      "ld1rqw { z3.s }, p0/Z, [x21]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      ".inst 0x658abc63  // bfcvt z3.h, p7/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
+      "ld1rqw { z7.s }, p0/Z, [x24]\n"
+      "ld1rqw { z6.s }, p0/Z, [x23]\n"
+      ".inst 0x658abce7  // bfcvt z7.h, p7/M, z7.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x22]\n"
+      "ld1rqw { z4.s }, p0/Z, [x21]\n"
+      ".inst 0x658abcc6  // bfcvt z6.h, p7/M, z6.s\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "uzp1 z7.h, z7.h, z7.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "trn1 z7.d, z7.d, z6.d\n"
+      ".inst 0x6463e4e8  // bfmmla z8.s, z7.h, z3.h\n"
       "sub x25, x25, #0x4\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
+      "trn1 z5.d, z5.d, z4.d\n"
+      ".inst 0x6463e4b4  // bfmmla z20.s, z5.h, z3.h\n"
+      ".inst 0x6462e4ee  // bfmmla z14.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
       "cmp x25, #0x4\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x6461e4b5  // bfmmla z21.s, z5.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x6460e4bb  // bfmmla z27.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x6463e4ea  // bfmmla z10.s, z7.h, z3.h\n"
+      ".inst 0x6463e4b6  // bfmmla z22.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f0  // bfmmla z16.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x6462e4bc  // bfmmla z28.s, z5.h, z2.h\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      ".inst 0x6461e4b7  // bfmmla z23.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f1  // bfmmla z17.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6463e4ec  // bfmmla z12.s, z7.h, z3.h\n"
       "addvl x28, x28, #-4\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6463e4b8  // bfmmla z24.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f2  // bfmmla z18.s, z7.h, z2.h\n"
+      ".inst 0x6462e4be  // bfmmla z30.s, z5.h, z2.h\n"
+      ".inst 0x6461e4ed  // bfmmla z13.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b9  // bfmmla z25.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f3  // bfmmla z19.s, z7.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x25\n"
-      "ld1rqw { z0.s }, p0/Z, [x24]\n"
-      "ld1rqw { z1.s }, p0/Z, [x23]\n"
-      ".inst 0x658abc00  // bfcvt z0.h, p7/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x22]\n"
-      "ld1rqw { z3.s }, p0/Z, [x21]\n"
-      ".inst 0x658abc21  // bfcvt z1.h, p7/M, z1.s\n"
-      ".inst 0x658abc42  // bfcvt z2.h, p7/M, z2.s\n"
-      ".inst 0x658abc63  // bfcvt z3.h, p7/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z4.h }, p7/Z, [x28]\n"
-      "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n"
-      "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
-      ".inst 0x6465e40e  // bfmmla z14.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x6465e45a  // bfmmla z26.s, z2.h, z5.h\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n"
-      ".inst 0x6467e45b  // bfmmla z27.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n"
+      "ld1rqw { z7.s }, p0/Z, [x24]\n"
+      "ld1rqw { z6.s }, p0/Z, [x23]\n"
+      ".inst 0x658abce7  // bfcvt z7.h, p7/M, z7.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x22]\n"
+      "ld1rqw { z4.s }, p0/Z, [x21]\n"
+      ".inst 0x658abcc6  // bfcvt z6.h, p7/M, z6.s\n"
+      ".inst 0x658abca5  // bfcvt z5.h, p7/M, z5.s\n"
+      ".inst 0x658abc84  // bfcvt z4.h, p7/M, z4.s\n"
+      "uzp1 z7.h, z7.h, z7.h\n"
+      "ld1h { z3.h }, p7/Z, [x28]\n"
+      "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n"
+      "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "trn1 z7.d, z7.d, z6.d\n"
+      ".inst 0x6463e4e8  // bfmmla z8.s, z7.h, z3.h\n"
+      "trn1 z5.d, z5.d, z4.d\n"
+      ".inst 0x6463e4b4  // bfmmla z20.s, z5.h, z3.h\n"
+      ".inst 0x6462e4ee  // bfmmla z14.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x6462e4ba  // bfmmla z26.s, z5.h, z2.h\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x6461e4b5  // bfmmla z21.s, z5.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x6460e4bb  // bfmmla z27.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e410  // bfmmla z16.s, z0.h, z5.h\n"
-      "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x6465e45c  // bfmmla z28.s, z2.h, z5.h\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x6466e457  // bfmmla z23.s, z2.h, z6.h\n"
-      ".inst 0x6467e411  // bfmmla z17.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n"
-      ".inst 0x6467e45d  // bfmmla z29.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6463e4ea  // bfmmla z10.s, z7.h, z3.h\n"
+      ".inst 0x6463e4b6  // bfmmla z22.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f0  // bfmmla z16.s, z7.h, z2.h\n"
+      "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x6462e4bc  // bfmmla z28.s, z5.h, z2.h\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x6461e4b7  // bfmmla z23.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f1  // bfmmla z17.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n"
+      ".inst 0x6460e4bd  // bfmmla z29.s, z5.h, z0.h\n"
+      "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x6463e4ec  // bfmmla z12.s, z7.h, z3.h\n"
       "addvl x28, x28, #-4\n"
-      ".inst 0x6464e458  // bfmmla z24.s, z2.h, z4.h\n"
-      ".inst 0x6465e412  // bfmmla z18.s, z0.h, z5.h\n"
-      ".inst 0x6465e45e  // bfmmla z30.s, z2.h, z5.h\n"
-      ".inst 0x6466e40d  // bfmmla z13.s, z0.h, z6.h\n"
-      ".inst 0x6466e459  // bfmmla z25.s, z2.h, z6.h\n"
-      ".inst 0x6467e413  // bfmmla z19.s, z0.h, z7.h\n"
-      ".inst 0x6467e45f  // bfmmla z31.s, z2.h, z7.h\n"
+      ".inst 0x6463e4b8  // bfmmla z24.s, z5.h, z3.h\n"
+      ".inst 0x6462e4f2  // bfmmla z18.s, z7.h, z2.h\n"
+      ".inst 0x6462e4be  // bfmmla z30.s, z5.h, z2.h\n"
+      ".inst 0x6461e4ed  // bfmmla z13.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b9  // bfmmla z25.s, z5.h, z1.h\n"
+      ".inst 0x6460e4f3  // bfmmla z19.s, z7.h, z0.h\n"
+      ".inst 0x6460e4bf  // bfmmla z31.s, z5.h, z0.h\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x26, x26, #0x1\n"
@@ -1295,7 +1295,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "54:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1303,4 +1302,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index b7c9aca..15b7dd7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -75,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, float>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -100,5 +99,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
index 23d7ff9..0d2b47e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
@@ -133,16 +133,16 @@
       "b 5f\n"
       "3:"  // Height 1: no bias
       "tbz %x[flags], #0, 4f\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x9]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "zip1 z8.d, z16.d, z12.d\n"
+      "zip2 z12.d, z16.d, z12.d\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 5f\n"
@@ -160,11 +160,11 @@
       "6:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 7f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 8f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -176,51 +176,51 @@
       "ble 10f\n"
       "9:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqw { z18.s }, p0/Z, [x26]\n"
+      ".inst 0x658ab652  // bfcvt z18.h, p5/M, z18.s\n"
+      "uzp1 z18.h, z18.h, z18.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
       "add x26, x26, #0x10\n"
       "addvl x10, x10, #8\n"
       "bgt 9b\n"
       "10:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
+      "ld1rqw { z18.s }, p0/Z, [x26]\n"
+      ".inst 0x658ab652  // bfcvt z18.h, p5/M, z18.s\n"
+      "uzp1 z18.h, z18.h, z18.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x6471e648  // bfmmla z8.s, z18.h, z17.h\n"
+      ".inst 0x6470e64c  // bfmmla z12.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e649  // bfmmla z9.s, z18.h, z17.h\n"
+      ".inst 0x6470e64d  // bfmmla z13.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e64a  // bfmmla z10.s, z18.h, z17.h\n"
+      ".inst 0x6470e64e  // bfmmla z14.s, z18.h, z16.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6471e64b  // bfmmla z11.s, z18.h, z17.h\n"
+      ".inst 0x6470e64f  // bfmmla z15.s, z18.h, z16.h\n"
       "addvl x10, x10, #8\n"
       "11:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -233,17 +233,17 @@
       "uzp1 z11.d, z11.d, z15.d\n"
       "tbz %x[flags], #1, 12f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "12:"  // Height 1: No activation
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
@@ -287,21 +287,21 @@
       "16:"  // Height 2: no bias
       "tbz %x[flags], #0, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x9, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 18f\n"
@@ -319,12 +319,12 @@
       "19:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 20f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 21f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -332,67 +332,67 @@
       "b 21f\n"
       "20:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
       "21:"  // Height 2: input setup done
       "cmp x27, #0x4\n"
       "ble 23f\n"
       "22:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqw { z19.s }, p0/Z, [x26]\n"
+      "ld1rqw { z18.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab673  // bfcvt z19.h, p5/M, z19.s\n"
+      ".inst 0x658ab652  // bfcvt z18.h, p5/M, z18.s\n"
+      "uzp1 z19.h, z19.h, z19.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z18.h, z18.h, z18.h\n"
+      "trn1 z19.d, z19.d, z18.d\n"
+      ".inst 0x6471e668  // bfmmla z8.s, z19.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6470e66c  // bfmmla z12.s, z19.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e669  // bfmmla z9.s, z19.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6470e66d  // bfmmla z13.s, z19.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e66a  // bfmmla z10.s, z19.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6470e66e  // bfmmla z14.s, z19.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
+      ".inst 0x6471e66b  // bfmmla z11.s, z19.h, z17.h\n"
+      ".inst 0x6470e66f  // bfmmla z15.s, z19.h, z16.h\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "addvl x10, x10, #8\n"
       "bgt 22b\n"
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      "ld1rqw { z19.s }, p0/Z, [x26]\n"
+      "ld1rqw { z18.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab673  // bfcvt z19.h, p5/M, z19.s\n"
+      ".inst 0x658ab652  // bfcvt z18.h, p5/M, z18.s\n"
+      "uzp1 z19.h, z19.h, z19.h\n"
+      "ld1h { z17.h }, p5/Z, [x10]\n"
+      "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z18.h, z18.h, z18.h\n"
+      "trn1 z19.d, z19.d, z18.d\n"
+      ".inst 0x6471e668  // bfmmla z8.s, z19.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6470e66c  // bfmmla z12.s, z19.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6471e669  // bfmmla z9.s, z19.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6470e66d  // bfmmla z13.s, z19.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6471e66a  // bfmmla z10.s, z19.h, z17.h\n"
+      "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6470e66e  // bfmmla z14.s, z19.h, z16.h\n"
+      "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6471e66b  // bfmmla z11.s, z19.h, z17.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
+      ".inst 0x6470e66f  // bfmmla z15.s, z19.h, z16.h\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -410,25 +410,25 @@
       "uzp2 z11.d, z11.d, z15.d\n"
       "tbz %x[flags], #1, 25f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z17.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z6.s, p5/M, z6.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmax z6.s, p5/M, z6.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
+      "ld1rw { z16.s }, p5/Z, [x20]\n"
+      "fmin z6.s, p5/M, z6.s, z17.s\n"
+      "fmin z12.s, p5/M, z12.s, z17.s\n"
+      "fmin z13.s, p5/M, z13.s, z17.s\n"
+      "fmin z14.s, p5/M, z14.s, z17.s\n"
+      "fmin z8.s, p5/M, z8.s, z17.s\n"
+      "fmin z9.s, p5/M, z9.s, z17.s\n"
+      "fmin z10.s, p5/M, z10.s, z17.s\n"
+      "fmin z11.s, p5/M, z11.s, z17.s\n"
+      "fmax z6.s, p5/M, z6.s, z16.s\n"
+      "fmax z12.s, p5/M, z12.s, z16.s\n"
+      "fmax z13.s, p5/M, z13.s, z16.s\n"
+      "fmax z14.s, p5/M, z14.s, z16.s\n"
+      "fmax z8.s, p5/M, z8.s, z16.s\n"
+      "fmax z9.s, p5/M, z9.s, z16.s\n"
+      "fmax z10.s, p5/M, z10.s, z16.s\n"
+      "fmax z11.s, p5/M, z11.s, z16.s\n"
       "25:"  // Height 2: No activation
       "st1w { z6.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -484,28 +484,28 @@
       "29:"  // Height 3: no bias
       "tbz %x[flags], #0, 30f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x20]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
@@ -537,13 +537,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -552,91 +552,91 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
       "34:"  // Height 3: input setup done
       "cmp x27, #0x4\n"
       "ble 36f\n"
       "35:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "ld1rqw { z28.s }, p0/Z, [x26]\n"
+      "ld1rqw { z27.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab79c  // bfcvt z28.h, p5/M, z28.s\n"
+      "ld1rqw { z26.s }, p0/Z, [x24]\n"
+      ".inst 0x658ab77b  // bfcvt z27.h, p5/M, z27.s\n"
+      "uzp1 z28.h, z28.h, z28.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "uzp1 z27.h, z27.h, z27.h\n"
+      ".inst 0x658ab75a  // bfcvt z26.h, p5/M, z26.s\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
       "sub x27, x27, #0x4\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
+      "trn1 z28.d, z28.d, z27.d\n"
+      "uzp1 z26.h, z26.h, z26.h\n"
+      ".inst 0x6479e788  // bfmmla z8.s, z28.h, z25.h\n"
       "cmp x27, #0x4\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e78c  // bfmmla z12.s, z28.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e789  // bfmmla z9.s, z28.h, z25.h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      ".inst 0x6478e78d  // bfmmla z13.s, z28.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6479e78a  // bfmmla z10.s, z28.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      ".inst 0x6478e78e  // bfmmla z14.s, z28.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6479e78b  // bfmmla z11.s, z28.h, z25.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      ".inst 0x6478e78f  // bfmmla z15.s, z28.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
       "bgt 35b\n"
       "36:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      "ld1rqw { z28.s }, p0/Z, [x26]\n"
+      "ld1rqw { z27.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab79c  // bfcvt z28.h, p5/M, z28.s\n"
+      "ld1rqw { z26.s }, p0/Z, [x24]\n"
+      ".inst 0x658ab77b  // bfcvt z27.h, p5/M, z27.s\n"
+      "uzp1 z28.h, z28.h, z28.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "uzp1 z27.h, z27.h, z27.h\n"
+      ".inst 0x658ab75a  // bfcvt z26.h, p5/M, z26.s\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "trn1 z28.d, z28.d, z27.d\n"
+      "uzp1 z26.h, z26.h, z26.h\n"
+      ".inst 0x6479e788  // bfmmla z8.s, z28.h, z25.h\n"
+      ".inst 0x6479e750  // bfmmla z16.s, z26.h, z25.h\n"
+      ".inst 0x6478e78c  // bfmmla z12.s, z28.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6478e754  // bfmmla z20.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e789  // bfmmla z9.s, z28.h, z25.h\n"
+      ".inst 0x6479e751  // bfmmla z17.s, z26.h, z25.h\n"
+      ".inst 0x6478e78d  // bfmmla z13.s, z28.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6478e755  // bfmmla z21.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6479e78a  // bfmmla z10.s, z28.h, z25.h\n"
+      ".inst 0x6479e752  // bfmmla z18.s, z26.h, z25.h\n"
+      ".inst 0x6478e78e  // bfmmla z14.s, z28.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e756  // bfmmla z22.s, z26.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6479e78b  // bfmmla z11.s, z28.h, z25.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
+      ".inst 0x6479e753  // bfmmla z19.s, z26.h, z25.h\n"
+      ".inst 0x6478e78f  // bfmmla z15.s, z28.h, z24.h\n"
+      ".inst 0x6478e757  // bfmmla z23.s, z26.h, z24.h\n"
       "37:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -659,33 +659,33 @@
       "uzp1 z19.d, z19.d, z23.d\n"
       "tbz %x[flags], #1, 38f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z25.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z6.s, p5/M, z6.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z6.s, p5/M, z6.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
+      "fmin z6.s, p5/M, z6.s, z25.s\n"
+      "fmin z12.s, p5/M, z12.s, z25.s\n"
+      "fmin z13.s, p5/M, z13.s, z25.s\n"
+      "fmin z14.s, p5/M, z14.s, z25.s\n"
+      "fmin z8.s, p5/M, z8.s, z25.s\n"
+      "fmin z9.s, p5/M, z9.s, z25.s\n"
+      "fmin z10.s, p5/M, z10.s, z25.s\n"
+      "fmin z11.s, p5/M, z11.s, z25.s\n"
+      "fmin z16.s, p5/M, z16.s, z25.s\n"
+      "fmin z17.s, p5/M, z17.s, z25.s\n"
+      "fmin z18.s, p5/M, z18.s, z25.s\n"
+      "fmin z19.s, p5/M, z19.s, z25.s\n"
+      "fmax z6.s, p5/M, z6.s, z24.s\n"
+      "fmax z12.s, p5/M, z12.s, z24.s\n"
+      "fmax z13.s, p5/M, z13.s, z24.s\n"
+      "fmax z14.s, p5/M, z14.s, z24.s\n"
+      "fmax z8.s, p5/M, z8.s, z24.s\n"
+      "fmax z9.s, p5/M, z9.s, z24.s\n"
+      "fmax z10.s, p5/M, z10.s, z24.s\n"
+      "fmax z11.s, p5/M, z11.s, z24.s\n"
+      "fmax z16.s, p5/M, z16.s, z24.s\n"
+      "fmax z17.s, p5/M, z17.s, z24.s\n"
+      "fmax z18.s, p5/M, z18.s, z24.s\n"
+      "fmax z19.s, p5/M, z19.s, z24.s\n"
       "38:"  // Height 3: No activation
       "st1w { z6.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -745,37 +745,37 @@
       "42:"  // Height 4: no bias
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x21]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
@@ -803,14 +803,14 @@
       "45:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -820,101 +820,101 @@
       "b 47f\n"
       "46:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
       "47:"  // Height 4: input setup done
       "cmp x27, #0x4\n"
       "ble 49f\n"
       "48:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
+      "ld1rqw { z29.s }, p0/Z, [x26]\n"
+      "ld1rqw { z28.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab7bd  // bfcvt z29.h, p5/M, z29.s\n"
+      "ld1rqw { z27.s }, p0/Z, [x24]\n"
+      "ld1rqw { z26.s }, p0/Z, [x23]\n"
+      ".inst 0x658ab79c  // bfcvt z28.h, p5/M, z28.s\n"
+      ".inst 0x658ab77b  // bfcvt z27.h, p5/M, z27.s\n"
+      ".inst 0x658ab75a  // bfcvt z26.h, p5/M, z26.s\n"
+      "uzp1 z29.h, z29.h, z29.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z28.h, z28.h, z28.h\n"
+      "uzp1 z27.h, z27.h, z27.h\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
+      "uzp1 z26.h, z26.h, z26.h\n"
+      "trn1 z29.d, z29.d, z28.d\n"
+      ".inst 0x6479e7a8  // bfmmla z8.s, z29.h, z25.h\n"
       "add x26, x26, #0x10\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
+      "trn1 z27.d, z27.d, z26.d\n"
+      ".inst 0x6479e770  // bfmmla z16.s, z27.h, z25.h\n"
+      ".inst 0x6478e7ac  // bfmmla z12.s, z29.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6478e774  // bfmmla z20.s, z27.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e7a9  // bfmmla z9.s, z29.h, z25.h\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6479e771  // bfmmla z17.s, z27.h, z25.h\n"
+      ".inst 0x6478e7ad  // bfmmla z13.s, z29.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
+      ".inst 0x6478e775  // bfmmla z21.s, z27.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6479e7aa  // bfmmla z10.s, z29.h, z25.h\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      ".inst 0x6479e772  // bfmmla z18.s, z27.h, z25.h\n"
+      ".inst 0x6478e7ae  // bfmmla z14.s, z29.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e776  // bfmmla z22.s, z27.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6479e7ab  // bfmmla z11.s, z29.h, z25.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
+      ".inst 0x6479e773  // bfmmla z19.s, z27.h, z25.h\n"
+      ".inst 0x6478e7af  // bfmmla z15.s, z29.h, z24.h\n"
+      ".inst 0x6478e777  // bfmmla z23.s, z27.h, z24.h\n"
       "bgt 48b\n"
       "49:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      "ld1rqw { z29.s }, p0/Z, [x26]\n"
+      "ld1rqw { z28.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab7bd  // bfcvt z29.h, p5/M, z29.s\n"
+      "ld1rqw { z27.s }, p0/Z, [x24]\n"
+      "ld1rqw { z26.s }, p0/Z, [x23]\n"
+      ".inst 0x658ab79c  // bfcvt z28.h, p5/M, z28.s\n"
+      ".inst 0x658ab77b  // bfcvt z27.h, p5/M, z27.s\n"
+      ".inst 0x658ab75a  // bfcvt z26.h, p5/M, z26.s\n"
+      "uzp1 z29.h, z29.h, z29.h\n"
+      "ld1h { z25.h }, p5/Z, [x10]\n"
+      "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z28.h, z28.h, z28.h\n"
+      "uzp1 z27.h, z27.h, z27.h\n"
+      "uzp1 z26.h, z26.h, z26.h\n"
+      "trn1 z29.d, z29.d, z28.d\n"
+      ".inst 0x6479e7a8  // bfmmla z8.s, z29.h, z25.h\n"
+      "trn1 z27.d, z27.d, z26.d\n"
+      ".inst 0x6479e770  // bfmmla z16.s, z27.h, z25.h\n"
+      ".inst 0x6478e7ac  // bfmmla z12.s, z29.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6478e774  // bfmmla z20.s, z27.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6479e7a9  // bfmmla z9.s, z29.h, z25.h\n"
+      ".inst 0x6479e771  // bfmmla z17.s, z27.h, z25.h\n"
+      ".inst 0x6478e7ad  // bfmmla z13.s, z29.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6478e775  // bfmmla z21.s, z27.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6479e7aa  // bfmmla z10.s, z29.h, z25.h\n"
+      ".inst 0x6479e772  // bfmmla z18.s, z27.h, z25.h\n"
+      ".inst 0x6478e7ae  // bfmmla z14.s, z29.h, z24.h\n"
+      "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6478e776  // bfmmla z22.s, z27.h, z24.h\n"
+      "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6479e7ab  // bfmmla z11.s, z29.h, z25.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
+      ".inst 0x6479e773  // bfmmla z19.s, z27.h, z25.h\n"
+      ".inst 0x6478e7af  // bfmmla z15.s, z29.h, z24.h\n"
+      ".inst 0x6478e777  // bfmmla z23.s, z27.h, z24.h\n"
       "50:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -942,41 +942,41 @@
       "uzp2 z19.d, z19.d, z23.d\n"
       "tbz %x[flags], #1, 51f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
+      "ld1rw { z24.s }, p5/Z, [x20]\n"
       "add x20, %x[args_ptr], %[offset_min]\n"
-      "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z6.s, p5/M, z6.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmax z6.s, p5/M, z6.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
+      "ld1rw { z23.s }, p5/Z, [x20]\n"
+      "fmin z6.s, p5/M, z6.s, z24.s\n"
+      "fmin z12.s, p5/M, z12.s, z24.s\n"
+      "fmin z13.s, p5/M, z13.s, z24.s\n"
+      "fmin z14.s, p5/M, z14.s, z24.s\n"
+      "fmin z8.s, p5/M, z8.s, z24.s\n"
+      "fmin z9.s, p5/M, z9.s, z24.s\n"
+      "fmin z10.s, p5/M, z10.s, z24.s\n"
+      "fmin z11.s, p5/M, z11.s, z24.s\n"
+      "fmin z15.s, p5/M, z15.s, z24.s\n"
+      "fmin z20.s, p5/M, z20.s, z24.s\n"
+      "fmin z21.s, p5/M, z21.s, z24.s\n"
+      "fmin z22.s, p5/M, z22.s, z24.s\n"
+      "fmin z16.s, p5/M, z16.s, z24.s\n"
+      "fmin z17.s, p5/M, z17.s, z24.s\n"
+      "fmin z18.s, p5/M, z18.s, z24.s\n"
+      "fmin z19.s, p5/M, z19.s, z24.s\n"
+      "fmax z6.s, p5/M, z6.s, z23.s\n"
+      "fmax z12.s, p5/M, z12.s, z23.s\n"
+      "fmax z13.s, p5/M, z13.s, z23.s\n"
+      "fmax z14.s, p5/M, z14.s, z23.s\n"
+      "fmax z8.s, p5/M, z8.s, z23.s\n"
+      "fmax z9.s, p5/M, z9.s, z23.s\n"
+      "fmax z10.s, p5/M, z10.s, z23.s\n"
+      "fmax z11.s, p5/M, z11.s, z23.s\n"
+      "fmax z15.s, p5/M, z15.s, z23.s\n"
+      "fmax z20.s, p5/M, z20.s, z23.s\n"
+      "fmax z21.s, p5/M, z21.s, z23.s\n"
+      "fmax z22.s, p5/M, z22.s, z23.s\n"
+      "fmax z16.s, p5/M, z16.s, z23.s\n"
+      "fmax z17.s, p5/M, z17.s, z23.s\n"
+      "fmax z18.s, p5/M, z18.s, z23.s\n"
+      "fmax z19.s, p5/M, z19.s, z23.s\n"
       "51:"  // Height 4: No activation
       "st1w { z6.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1048,54 +1048,54 @@
       "55:"  // Height 5: no bias
       "tbz %x[flags], #0, 56f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x22]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x22]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x20]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z19.d, z24.d, z23.d\n"
       "zip2 z23.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z24.d, z25.d, z28.d\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 57f\n"
       "56:"  // Height 5: no accumulate
       "mov z8.b, #0x0\n"
@@ -1127,15 +1127,15 @@
       "58:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 59f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 60f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1146,125 +1146,125 @@
       "b 60f\n"
       "59:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
       "60:"  // Height 5: input setup done
       "cmp x27, #0x4\n"
       "ble 62f\n"
       "61:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
+      "ld1rqw { z6.s }, p0/Z, [x26]\n"
+      "ld1rqw { z5.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab4c6  // bfcvt z6.h, p5/M, z6.s\n"
+      "ld1rqw { z4.s }, p0/Z, [x24]\n"
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
+      ".inst 0x658ab4a5  // bfcvt z5.h, p5/M, z5.s\n"
+      ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
+      "ld1rqw { z2.s }, p0/Z, [x22]\n"
       ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
       "sub x27, x27, #0x4\n"
       "uzp1 z3.h, z3.h, z3.h\n"
-      ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
+      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
       "cmp x27, #0x4\n"
       "add x26, x26, #0x10\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
+      "trn1 z6.d, z6.d, z5.d\n"
+      "trn1 z4.d, z4.d, z3.d\n"
+      ".inst 0x6461e4c8  // bfmmla z8.s, z6.h, z1.h\n"
       "add x25, x25, #0x10\n"
-      "uzp1 z4.h, z4.h, z4.h\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6466e498  // bfmmla z24.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
+      "uzp1 z2.h, z2.h, z2.h\n"
+      ".inst 0x6461e490  // bfmmla z16.s, z4.h, z1.h\n"
+      ".inst 0x6461e458  // bfmmla z24.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e4cc  // bfmmla z12.s, z6.h, z0.h\n"
+      ".inst 0x6460e494  // bfmmla z20.s, z4.h, z0.h\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6467e49c  // bfmmla z28.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
+      ".inst 0x6460e45c  // bfmmla z28.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4c9  // bfmmla z9.s, z6.h, z1.h\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6466e499  // bfmmla z25.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      ".inst 0x6467e49d  // bfmmla z29.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6466e49a  // bfmmla z26.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      ".inst 0x6467e49e  // bfmmla z30.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      ".inst 0x6461e491  // bfmmla z17.s, z4.h, z1.h\n"
+      ".inst 0x6461e459  // bfmmla z25.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e4cd  // bfmmla z13.s, z6.h, z0.h\n"
+      ".inst 0x6460e495  // bfmmla z21.s, z4.h, z0.h\n"
+      ".inst 0x6460e45d  // bfmmla z29.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461e4ca  // bfmmla z10.s, z6.h, z1.h\n"
+      ".inst 0x6461e492  // bfmmla z18.s, z4.h, z1.h\n"
+      ".inst 0x6461e45a  // bfmmla z26.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4ce  // bfmmla z14.s, z6.h, z0.h\n"
+      ".inst 0x6460e496  // bfmmla z22.s, z4.h, z0.h\n"
+      ".inst 0x6460e45e  // bfmmla z30.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6461e4cb  // bfmmla z11.s, z6.h, z1.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6466e49b  // bfmmla z27.s, z4.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6467e49f  // bfmmla z31.s, z4.h, z7.h\n"
+      ".inst 0x6461e493  // bfmmla z19.s, z4.h, z1.h\n"
+      ".inst 0x6461e45b  // bfmmla z27.s, z2.h, z1.h\n"
+      ".inst 0x6460e4cf  // bfmmla z15.s, z6.h, z0.h\n"
+      ".inst 0x6460e497  // bfmmla z23.s, z4.h, z0.h\n"
+      ".inst 0x6460e45f  // bfmmla z31.s, z2.h, z0.h\n"
       "bgt 61b\n"
       "62:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
+      "ld1rqw { z6.s }, p0/Z, [x26]\n"
+      "ld1rqw { z5.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab4c6  // bfcvt z6.h, p5/M, z6.s\n"
+      "ld1rqw { z4.s }, p0/Z, [x24]\n"
       "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
+      ".inst 0x658ab4a5  // bfcvt z5.h, p5/M, z5.s\n"
       ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
+      "ld1rqw { z2.s }, p0/Z, [x22]\n"
+      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
       "uzp1 z4.h, z4.h, z4.h\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6466e498  // bfmmla z24.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      ".inst 0x6467e49c  // bfmmla z28.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6466e499  // bfmmla z25.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      ".inst 0x6467e49d  // bfmmla z29.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6466e49a  // bfmmla z26.s, z4.h, z6.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      ".inst 0x6467e49e  // bfmmla z30.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z3.h, z3.h, z3.h\n"
+      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
+      "trn1 z6.d, z6.d, z5.d\n"
+      "trn1 z4.d, z4.d, z3.d\n"
+      ".inst 0x6461e4c8  // bfmmla z8.s, z6.h, z1.h\n"
+      "uzp1 z2.h, z2.h, z2.h\n"
+      ".inst 0x6461e490  // bfmmla z16.s, z4.h, z1.h\n"
+      ".inst 0x6461e458  // bfmmla z24.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e4cc  // bfmmla z12.s, z6.h, z0.h\n"
+      ".inst 0x6460e494  // bfmmla z20.s, z4.h, z0.h\n"
+      ".inst 0x6460e45c  // bfmmla z28.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4c9  // bfmmla z9.s, z6.h, z1.h\n"
+      ".inst 0x6461e491  // bfmmla z17.s, z4.h, z1.h\n"
+      ".inst 0x6461e459  // bfmmla z25.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e4cd  // bfmmla z13.s, z6.h, z0.h\n"
+      ".inst 0x6460e495  // bfmmla z21.s, z4.h, z0.h\n"
+      ".inst 0x6460e45d  // bfmmla z29.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461e4ca  // bfmmla z10.s, z6.h, z1.h\n"
+      ".inst 0x6461e492  // bfmmla z18.s, z4.h, z1.h\n"
+      ".inst 0x6461e45a  // bfmmla z26.s, z2.h, z1.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4ce  // bfmmla z14.s, z6.h, z0.h\n"
+      ".inst 0x6460e496  // bfmmla z22.s, z4.h, z0.h\n"
+      ".inst 0x6460e45e  // bfmmla z30.s, z2.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6461e4cb  // bfmmla z11.s, z6.h, z1.h\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6466e49b  // bfmmla z27.s, z4.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6467e49f  // bfmmla z31.s, z4.h, z7.h\n"
+      ".inst 0x6461e493  // bfmmla z19.s, z4.h, z1.h\n"
+      ".inst 0x6461e45b  // bfmmla z27.s, z2.h, z1.h\n"
+      ".inst 0x6460e4cf  // bfmmla z15.s, z6.h, z0.h\n"
+      ".inst 0x6460e497  // bfmmla z23.s, z4.h, z0.h\n"
+      ".inst 0x6460e45f  // bfmmla z31.s, z2.h, z0.h\n"
       "63:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1297,49 +1297,49 @@
       "uzp1 z27.d, z27.d, z31.d\n"
       "tbz %x[flags], #1, 64f\n"
       "add x20, %x[args_ptr], %[offset_max]\n"
-      "ld1rw { z1.s }, p5/Z, [x20]\n"
-      "add x20, %x[args_ptr], %[offset_min]\n"
       "ld1rw { z0.s }, p5/Z, [x20]\n"
-      "fmin z6.s, p5/M, z6.s, z1.s\n"
-      "fmin z12.s, p5/M, z12.s, z1.s\n"
-      "fmin z13.s, p5/M, z13.s, z1.s\n"
-      "fmin z14.s, p5/M, z14.s, z1.s\n"
-      "fmin z8.s, p5/M, z8.s, z1.s\n"
-      "fmin z9.s, p5/M, z9.s, z1.s\n"
-      "fmin z10.s, p5/M, z10.s, z1.s\n"
-      "fmin z11.s, p5/M, z11.s, z1.s\n"
-      "fmin z15.s, p5/M, z15.s, z1.s\n"
-      "fmin z20.s, p5/M, z20.s, z1.s\n"
-      "fmin z21.s, p5/M, z21.s, z1.s\n"
-      "fmin z22.s, p5/M, z22.s, z1.s\n"
-      "fmin z16.s, p5/M, z16.s, z1.s\n"
-      "fmin z17.s, p5/M, z17.s, z1.s\n"
-      "fmin z18.s, p5/M, z18.s, z1.s\n"
-      "fmin z19.s, p5/M, z19.s, z1.s\n"
-      "fmin z24.s, p5/M, z24.s, z1.s\n"
-      "fmin z25.s, p5/M, z25.s, z1.s\n"
-      "fmin z26.s, p5/M, z26.s, z1.s\n"
-      "fmin z27.s, p5/M, z27.s, z1.s\n"
-      "fmax z6.s, p5/M, z6.s, z0.s\n"
-      "fmax z12.s, p5/M, z12.s, z0.s\n"
-      "fmax z13.s, p5/M, z13.s, z0.s\n"
-      "fmax z14.s, p5/M, z14.s, z0.s\n"
-      "fmax z8.s, p5/M, z8.s, z0.s\n"
-      "fmax z9.s, p5/M, z9.s, z0.s\n"
-      "fmax z10.s, p5/M, z10.s, z0.s\n"
-      "fmax z11.s, p5/M, z11.s, z0.s\n"
-      "fmax z15.s, p5/M, z15.s, z0.s\n"
-      "fmax z20.s, p5/M, z20.s, z0.s\n"
-      "fmax z21.s, p5/M, z21.s, z0.s\n"
-      "fmax z22.s, p5/M, z22.s, z0.s\n"
-      "fmax z16.s, p5/M, z16.s, z0.s\n"
-      "fmax z17.s, p5/M, z17.s, z0.s\n"
-      "fmax z18.s, p5/M, z18.s, z0.s\n"
-      "fmax z19.s, p5/M, z19.s, z0.s\n"
-      "fmax z24.s, p5/M, z24.s, z0.s\n"
-      "fmax z25.s, p5/M, z25.s, z0.s\n"
-      "fmax z26.s, p5/M, z26.s, z0.s\n"
-      "fmax z27.s, p5/M, z27.s, z0.s\n"
+      "add x20, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z23.s }, p5/Z, [x20]\n"
+      "fmin z6.s, p5/M, z6.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmax z6.s, p5/M, z6.s, z23.s\n"
+      "fmax z12.s, p5/M, z12.s, z23.s\n"
+      "fmax z13.s, p5/M, z13.s, z23.s\n"
+      "fmax z14.s, p5/M, z14.s, z23.s\n"
+      "fmax z8.s, p5/M, z8.s, z23.s\n"
+      "fmax z9.s, p5/M, z9.s, z23.s\n"
+      "fmax z10.s, p5/M, z10.s, z23.s\n"
+      "fmax z11.s, p5/M, z11.s, z23.s\n"
+      "fmax z15.s, p5/M, z15.s, z23.s\n"
+      "fmax z20.s, p5/M, z20.s, z23.s\n"
+      "fmax z21.s, p5/M, z21.s, z23.s\n"
+      "fmax z22.s, p5/M, z22.s, z23.s\n"
+      "fmax z16.s, p5/M, z16.s, z23.s\n"
+      "fmax z17.s, p5/M, z17.s, z23.s\n"
+      "fmax z18.s, p5/M, z18.s, z23.s\n"
+      "fmax z19.s, p5/M, z19.s, z23.s\n"
+      "fmax z24.s, p5/M, z24.s, z23.s\n"
+      "fmax z25.s, p5/M, z25.s, z23.s\n"
+      "fmax z26.s, p5/M, z26.s, z23.s\n"
+      "fmax z27.s, p5/M, z27.s, z23.s\n"
       "64:"  // Height 5: No activation
       "st1w { z6.s }, p4, [x9]\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1418,59 +1418,59 @@
       "68:"  // Height 6: no bias
       "tbz %x[flags], #0, 69f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x25, x9, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
+      "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
+      "ld1w { z17.s }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
       "add x21, x22, x20, LSL #2\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x25]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x24]\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x24]\n"
+      "zip1 z8.d, z17.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+      "zip2 z12.d, z17.d, z12.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x23]\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z20.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip2 z14.d, z20.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x23]\n"
+      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x22]\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "zip1 z16.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x22]\n"
+      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x21]\n"
       "zip2 z21.d, z18.d, z21.d\n"
       "zip1 z18.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x20]\n"
       "zip2 z23.d, z24.d, z23.d\n"
       "zip1 z24.d, z25.d, z28.d\n"
-      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
-      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 70f\n"
       "69:"  // Height 6: no accumulate
       "mov z8.b, #0x0\n"
@@ -1502,16 +1502,16 @@
       "71:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 72f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 73f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20, LSL #2\n"
@@ -1523,135 +1523,135 @@
       "b 73f\n"
       "72:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20, LSL #2\n"
-      "add x24, x25, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "add x22, x23, x20, LSL #2\n"
-      "add x21, x22, x20, LSL #2\n"
+      "add x25, x26, x21, LSL #2\n"
+      "add x24, x25, x21, LSL #2\n"
+      "add x23, x24, x21, LSL #2\n"
+      "add x22, x23, x21, LSL #2\n"
+      "add x21, x22, x21, LSL #2\n"
       "73:"  // Height 6: input setup done
       "cmp x27, #0x4\n"
       "ble 75f\n"
       "74:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1rqw { z5.s }, p0/Z, [x21]\n"
-      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
-      ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
+      "ld1rqw { z7.s }, p0/Z, [x26]\n"
+      "ld1rqw { z6.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab4e7  // bfcvt z7.h, p5/M, z7.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z4.s }, p0/Z, [x23]\n"
+      ".inst 0x658ab4c6  // bfcvt z6.h, p5/M, z6.s\n"
       ".inst 0x658ab4a5  // bfcvt z5.h, p5/M, z5.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
+      "ld1rqw { z3.s }, p0/Z, [x22]\n"
+      "ld1rqw { z2.s }, p0/Z, [x21]\n"
+      ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
+      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
+      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
+      "uzp1 z7.h, z7.h, z7.h\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
+      "uzp1 z5.h, z5.h, z5.h\n"
       "sub x27, x27, #0x4\n"
       "cmp x27, #0x4\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
       "uzp1 z4.h, z4.h, z4.h\n"
+      "uzp1 z3.h, z3.h, z3.h\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "uzp1 z5.h, z5.h, z5.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
+      "uzp1 z2.h, z2.h, z2.h\n"
+      "trn1 z7.d, z7.d, z6.d\n"
+      ".inst 0x6461e4e8  // bfmmla z8.s, z7.h, z1.h\n"
       "add x24, x24, #0x10\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      "trn1 z4.d, z4.d, z5.d\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
+      "trn1 z5.d, z5.d, z4.d\n"
+      "trn1 z3.d, z3.d, z2.d\n"
+      ".inst 0x6461e4b0  // bfmmla z16.s, z5.h, z1.h\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x6466e498  // bfmmla z24.s, z4.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6461e478  // bfmmla z24.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      ".inst 0x6467e49c  // bfmmla z28.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6460e4b4  // bfmmla z20.s, z5.h, z0.h\n"
+      ".inst 0x6460e47c  // bfmmla z28.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6466e499  // bfmmla z25.s, z4.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      ".inst 0x6467e49d  // bfmmla z29.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6466e49a  // bfmmla z26.s, z4.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      ".inst 0x6467e49e  // bfmmla z30.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b1  // bfmmla z17.s, z5.h, z1.h\n"
+      ".inst 0x6461e479  // bfmmla z25.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e4b5  // bfmmla z21.s, z5.h, z0.h\n"
+      ".inst 0x6460e47d  // bfmmla z29.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461e4ea  // bfmmla z10.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b2  // bfmmla z18.s, z5.h, z1.h\n"
+      ".inst 0x6461e47a  // bfmmla z26.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4b6  // bfmmla z22.s, z5.h, z0.h\n"
+      ".inst 0x6460e47e  // bfmmla z30.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6466e49b  // bfmmla z27.s, z4.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6467e49f  // bfmmla z31.s, z4.h, z7.h\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b3  // bfmmla z19.s, z5.h, z1.h\n"
+      ".inst 0x6461e47b  // bfmmla z27.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b7  // bfmmla z23.s, z5.h, z0.h\n"
+      ".inst 0x6460e47f  // bfmmla z31.s, z3.h, z0.h\n"
       "bgt 74b\n"
       "75:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.s, XZR, x27\n"
-      "ld1rqw { z0.s }, p0/Z, [x26]\n"
-      "ld1rqw { z1.s }, p0/Z, [x25]\n"
-      ".inst 0x658ab400  // bfcvt z0.h, p5/M, z0.s\n"
-      "ld1rqw { z2.s }, p0/Z, [x24]\n"
-      "ld1rqw { z3.s }, p0/Z, [x23]\n"
-      ".inst 0x658ab421  // bfcvt z1.h, p5/M, z1.s\n"
-      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
-      "ld1rqw { z4.s }, p0/Z, [x22]\n"
-      "ld1rqw { z5.s }, p0/Z, [x21]\n"
-      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
-      ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
+      "ld1rqw { z7.s }, p0/Z, [x26]\n"
+      "ld1rqw { z6.s }, p0/Z, [x25]\n"
+      ".inst 0x658ab4e7  // bfcvt z7.h, p5/M, z7.s\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "ld1rqw { z4.s }, p0/Z, [x23]\n"
+      ".inst 0x658ab4c6  // bfcvt z6.h, p5/M, z6.s\n"
       ".inst 0x658ab4a5  // bfcvt z5.h, p5/M, z5.s\n"
-      "uzp1 z0.h, z0.h, z0.h\n"
-      "ld1h { z6.h }, p5/Z, [x10]\n"
-      "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
-      "uzp1 z1.h, z1.h, z1.h\n"
-      "uzp1 z2.h, z2.h, z2.h\n"
-      "uzp1 z3.h, z3.h, z3.h\n"
-      "uzp1 z4.h, z4.h, z4.h\n"
+      "ld1rqw { z3.s }, p0/Z, [x22]\n"
+      "ld1rqw { z2.s }, p0/Z, [x21]\n"
+      ".inst 0x658ab484  // bfcvt z4.h, p5/M, z4.s\n"
+      ".inst 0x658ab463  // bfcvt z3.h, p5/M, z3.s\n"
+      ".inst 0x658ab442  // bfcvt z2.h, p5/M, z2.s\n"
+      "uzp1 z7.h, z7.h, z7.h\n"
+      "ld1h { z1.h }, p5/Z, [x10]\n"
+      "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n"
+      "uzp1 z6.h, z6.h, z6.h\n"
       "uzp1 z5.h, z5.h, z5.h\n"
-      "trn1 z0.d, z0.d, z1.d\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "trn1 z2.d, z2.d, z3.d\n"
-      "trn1 z4.d, z4.d, z5.d\n"
-      ".inst 0x6466e450  // bfmmla z16.s, z2.h, z6.h\n"
-      ".inst 0x6466e498  // bfmmla z24.s, z4.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x6467e454  // bfmmla z20.s, z2.h, z7.h\n"
-      ".inst 0x6467e49c  // bfmmla z28.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6466e451  // bfmmla z17.s, z2.h, z6.h\n"
-      ".inst 0x6466e499  // bfmmla z25.s, z4.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
-      ".inst 0x6467e49d  // bfmmla z29.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6466e452  // bfmmla z18.s, z2.h, z6.h\n"
-      ".inst 0x6466e49a  // bfmmla z26.s, z4.h, z6.h\n"
-      ".inst 0x6467e40e  // bfmmla z14.s, z0.h, z7.h\n"
-      "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x6467e456  // bfmmla z22.s, z2.h, z7.h\n"
-      ".inst 0x6467e49e  // bfmmla z30.s, z4.h, z7.h\n"
-      "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+      "uzp1 z4.h, z4.h, z4.h\n"
+      "uzp1 z3.h, z3.h, z3.h\n"
+      "uzp1 z2.h, z2.h, z2.h\n"
+      "trn1 z7.d, z7.d, z6.d\n"
+      ".inst 0x6461e4e8  // bfmmla z8.s, z7.h, z1.h\n"
+      "trn1 z5.d, z5.d, z4.d\n"
+      "trn1 z3.d, z3.d, z2.d\n"
+      ".inst 0x6461e4b0  // bfmmla z16.s, z5.h, z1.h\n"
+      ".inst 0x6461e478  // bfmmla z24.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ec  // bfmmla z12.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x6460e4b4  // bfmmla z20.s, z5.h, z0.h\n"
+      ".inst 0x6460e47c  // bfmmla z28.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x6461e4e9  // bfmmla z9.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b1  // bfmmla z17.s, z5.h, z1.h\n"
+      ".inst 0x6461e479  // bfmmla z25.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x6460e4b5  // bfmmla z21.s, z5.h, z0.h\n"
+      ".inst 0x6460e47d  // bfmmla z29.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x6461e4ea  // bfmmla z10.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b2  // bfmmla z18.s, z5.h, z1.h\n"
+      ".inst 0x6461e47a  // bfmmla z26.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ee  // bfmmla z14.s, z7.h, z0.h\n"
+      "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x6460e4b6  // bfmmla z22.s, z5.h, z0.h\n"
+      ".inst 0x6460e47e  // bfmmla z30.s, z3.h, z0.h\n"
+      "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x6466e40b  // bfmmla z11.s, z0.h, z6.h\n"
-      ".inst 0x6466e453  // bfmmla z19.s, z2.h, z6.h\n"
-      ".inst 0x6466e49b  // bfmmla z27.s, z4.h, z6.h\n"
-      ".inst 0x6467e40f  // bfmmla z15.s, z0.h, z7.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6467e49f  // bfmmla z31.s, z4.h, z7.h\n"
+      ".inst 0x6461e4eb  // bfmmla z11.s, z7.h, z1.h\n"
+      ".inst 0x6461e4b3  // bfmmla z19.s, z5.h, z1.h\n"
+      ".inst 0x6461e47b  // bfmmla z27.s, z3.h, z1.h\n"
+      ".inst 0x6460e4ef  // bfmmla z15.s, z7.h, z0.h\n"
+      ".inst 0x6460e4b7  // bfmmla z23.s, z5.h, z0.h\n"
+      ".inst 0x6460e47f  // bfmmla z31.s, z3.h, z0.h\n"
       "76:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1782,7 +1782,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "80:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1790,4 +1789,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c8a7d66..ffc1606 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, int8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
index 562b275..b7c5234 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -104,11 +104,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -121,39 +121,39 @@
       "7:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28]\n"
+      "sdot z16.s, z20.b, z0.b[0]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "sdot z17.s, z21.b, z0.b[0]\n"
+      "sdot z18.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z19.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "sdot z16.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "sdot z17.s, z21.b, z0.b[1]\n"
+      "sdot z18.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
+      "sdot z19.s, z20.b, z0.b[1]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "sdot z16.s, z22.b, z0.b[2]\n"
+      "sdot z17.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "sdot z18.s, z21.b, z0.b[2]\n"
+      "sdot z19.s, z20.b, z0.b[2]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "sdot z16.s, z22.b, z0.b[3]\n"
+      "sdot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "sdot z18.s, z21.b, z0.b[3]\n"
+      "sdot z19.s, z20.b, z0.b[3]\n"
       "add x24, x24, #0x10\n"
       "tbnz %x[flags], #31, 8f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -164,47 +164,47 @@
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "ld1b { z22.b }, p2/Z, [x28]\n"
       "subs x25, x25, #0x4\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "sdot z16.s, z22.b, z0.b[0]\n"
+      "sdot z17.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z18.s, z21.b, z0.b[0]\n"
+      "sdot z19.s, z20.b, z0.b[0]\n"
       "addvl x28, x28, #4\n"
       "ble 10f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
+      "sdot z16.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z17.s, z22.b, z0.b[1]\n"
+      "sdot z18.s, z21.b, z0.b[1]\n"
+      "sdot z19.s, z20.b, z0.b[1]\n"
       "addvl x28, x28, #4\n"
       "ble 10f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
+      "sdot z16.s, z20.b, z0.b[2]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z17.s, z22.b, z0.b[2]\n"
+      "sdot z18.s, z21.b, z0.b[2]\n"
+      "sdot z19.s, z20.b, z0.b[2]\n"
       "addvl x28, x28, #4\n"
       "ble 10f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
+      "ld1b { z21.b }, p2/Z, [x28]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z21.b, z0.b[3]\n"
+      "sdot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z18.s, z21.b, z0.b[3]\n"
+      "sdot z19.s, z20.b, z0.b[3]\n"
       "addvl x28, x28, #4\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 11f\n"
@@ -218,71 +218,71 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
       "saddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
-      "neg z1.s, p2/M, z1.s\n"
-      "mul z11.s, p2/M, z11.s, z1.s\n"
+      "neg z20.s, p2/M, z20.s\n"
+      "mul z11.s, p2/M, z11.s, z20.s\n"
       "12:"  // Height 1: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z23.s }, p2/Z, [x10]\n"
+      "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "add z16.s, z16.s, z23.s\n"
+      "add z17.s, z17.s, z22.s\n"
+      "add z18.s, z18.s, z21.s\n"
+      "add z19.s, z19.s, z20.s\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04b47610  // sqrdmulh z16.s, z16.s, z20.s\n"
+      ".inst 0x04b47631  // sqrdmulh z17.s, z17.s, z20.s\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04b47652  // sqrdmulh z18.s, z18.s, z20.s\n"
+      ".inst 0x04b47673  // sqrdmulh z19.s, z19.s, z20.s\n"
       "tbz %x[flags], #5, 13f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z23.d, z16.d, z0.d\n"
+      "and z22.d, z17.d, z0.d\n"
+      "and z21.d, z18.d, z0.d\n"
+      "and z20.d, z19.d, z0.d\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z23.s\n"
+      "sqadd z17.s, z17.s, z22.s\n"
+      "sqadd z18.s, z18.s, z21.s\n"
+      "sqadd z19.s, z19.s, z20.s\n"
       "13:"  // Height 1: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z20.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z20.s\n"
+      "add z18.s, z18.s, z20.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z19.s, z19.s, z4.s\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z20.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z21.s\n"
+      "smin z17.s, p2/M, z17.s, z21.s\n"
+      "smin z18.s, p2/M, z18.s, z21.s\n"
+      "smin z19.s, p2/M, z19.s, z21.s\n"
+      "smax z16.s, p2/M, z16.s, z20.s\n"
+      "smax z17.s, p2/M, z17.s, z20.s\n"
+      "smax z18.s, p2/M, z18.s, z20.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z20.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
       "st1b { z16.b }, p1, [x27]\n"
@@ -317,12 +317,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -330,7 +330,7 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "20:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "ble 23f\n"
@@ -339,56 +339,56 @@
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x24, x24, #0x10\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "sdot z21.s, z5.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z22.s, z6.b, z1.b[0]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "sdot z23.s, z7.b, z1.b[0]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z24.b, z0.b[0]\n"
+      "sdot z20.s, z24.b, z1.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z17.s, z26.b, z0.b[0]\n"
+      "sdot z21.s, z26.b, z1.b[0]\n"
+      "sdot z18.s, z24.b, z0.b[0]\n"
+      "sdot z22.s, z24.b, z1.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "sdot z19.s, z25.b, z0.b[0]\n"
+      "sdot z23.s, z25.b, z1.b[0]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "sdot z20.s, z8.b, z1.b[1]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "sdot z21.s, z9.b, z1.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "sdot z22.s, z10.b, z1.b[1]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
-      "sdot z23.s, z4.b, z1.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "sdot z20.s, z5.b, z1.b[2]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "sdot z16.s, z24.b, z0.b[1]\n"
+      "sdot z20.s, z24.b, z1.b[1]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "sdot z17.s, z27.b, z0.b[1]\n"
+      "sdot z21.s, z27.b, z1.b[1]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "sdot z18.s, z26.b, z0.b[1]\n"
+      "sdot z22.s, z26.b, z1.b[1]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "sdot z19.s, z25.b, z0.b[1]\n"
+      "sdot z23.s, z25.b, z1.b[1]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "sdot z16.s, z24.b, z0.b[2]\n"
+      "sdot z20.s, z24.b, z1.b[2]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "sdot z21.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z22.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
-      "sdot z23.s, z8.b, z1.b[2]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z20.s, z9.b, z1.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "sdot z21.s, z10.b, z1.b[3]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z22.s, z4.b, z1.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z23.s, z5.b, z1.b[3]\n"
+      "sdot z17.s, z30.b, z0.b[2]\n"
+      "sdot z21.s, z30.b, z1.b[2]\n"
+      "sdot z18.s, z29.b, z0.b[2]\n"
+      "sdot z22.s, z29.b, z1.b[2]\n"
+      "sdot z19.s, z28.b, z0.b[2]\n"
+      "sdot z23.s, z28.b, z1.b[2]\n"
+      "sdot z16.s, z27.b, z0.b[3]\n"
+      "sdot z20.s, z27.b, z1.b[3]\n"
+      "sdot z17.s, z26.b, z0.b[3]\n"
+      "sdot z21.s, z26.b, z1.b[3]\n"
+      "sdot z18.s, z25.b, z0.b[3]\n"
+      "sdot z22.s, z25.b, z1.b[3]\n"
+      "sdot z19.s, z24.b, z0.b[3]\n"
+      "sdot z23.s, z24.b, z1.b[3]\n"
       "tbnz %x[flags], #31, 22f\n"
       "sdot z11.s, z0.b, z15.b\n"
       "sdot z12.s, z1.b, z15.b\n"
@@ -401,63 +401,63 @@
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "subs x25, x25, #0x4\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "sdot z21.s, z5.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z24.b, z0.b[0]\n"
+      "sdot z20.s, z24.b, z1.b[0]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z17.s, z26.b, z0.b[0]\n"
+      "sdot z21.s, z26.b, z1.b[0]\n"
+      "sdot z18.s, z25.b, z0.b[0]\n"
+      "sdot z22.s, z25.b, z1.b[0]\n"
       "addvl x28, x28, #4\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "sdot z23.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z24.b, z0.b[0]\n"
+      "sdot z23.s, z24.b, z1.b[0]\n"
       "ble 24f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z20.s, z8.b, z1.b[1]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "sdot z21.s, z9.b, z1.b[1]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
+      "sdot z16.s, z27.b, z0.b[1]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z20.s, z27.b, z1.b[1]\n"
+      "sdot z17.s, z26.b, z0.b[1]\n"
+      "sdot z21.s, z26.b, z1.b[1]\n"
+      "sdot z18.s, z25.b, z0.b[1]\n"
       "addvl x28, x28, #4\n"
-      "sdot z22.s, z10.b, z1.b[1]\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
-      "sdot z23.s, z4.b, z1.b[1]\n"
+      "sdot z22.s, z25.b, z1.b[1]\n"
+      "sdot z19.s, z24.b, z0.b[1]\n"
+      "sdot z23.s, z24.b, z1.b[1]\n"
       "ble 24f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z20.s, z5.b, z1.b[2]\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "sdot z21.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
+      "sdot z16.s, z27.b, z0.b[2]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z20.s, z27.b, z1.b[2]\n"
+      "sdot z17.s, z26.b, z0.b[2]\n"
+      "sdot z21.s, z26.b, z1.b[2]\n"
+      "sdot z18.s, z25.b, z0.b[2]\n"
       "addvl x28, x28, #4\n"
-      "sdot z22.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
-      "sdot z23.s, z8.b, z1.b[2]\n"
+      "sdot z22.s, z25.b, z1.b[2]\n"
+      "sdot z19.s, z24.b, z0.b[2]\n"
+      "sdot z23.s, z24.b, z1.b[2]\n"
       "ble 24f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z20.s, z9.b, z1.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "sdot z21.s, z10.b, z1.b[3]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z22.s, z4.b, z1.b[3]\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z24.b, z0.b[3]\n"
+      "sdot z20.s, z24.b, z1.b[3]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z17.s, z26.b, z0.b[3]\n"
+      "sdot z21.s, z26.b, z1.b[3]\n"
+      "sdot z18.s, z25.b, z0.b[3]\n"
+      "sdot z22.s, z25.b, z1.b[3]\n"
       "addvl x28, x28, #4\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z23.s, z5.b, z1.b[3]\n"
+      "sdot z19.s, z24.b, z0.b[3]\n"
+      "sdot z23.s, z24.b, z1.b[3]\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 25f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -473,120 +473,120 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z2.s }, p2/Z, [x20]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       "saddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
       "saddv d12, p0, z12.s\n"
-      "neg z2.s, p2/M, z2.s\n"
+      "neg z24.s, p2/M, z24.s\n"
       "mov z12.s, z12.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z2.s\n"
-      "mul z12.s, p2/M, z12.s, z2.s\n"
+      "mul z11.s, p2/M, z11.s, z24.s\n"
+      "mul z12.s, p2/M, z12.s, z24.s\n"
       "26:"  // Height 2: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x10]\n"
+      "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z20.s, z20.s, z12.s\n"
       "add z21.s, z21.s, z12.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z22.s, z22.s, z12.s\n"
       "add z23.s, z23.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
+      "add z16.s, z16.s, z28.s\n"
+      "add z17.s, z17.s, z27.s\n"
       "addvl x10, x10, #4\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
+      "add z18.s, z18.s, z26.s\n"
+      "add z19.s, z19.s, z25.s\n"
+      "add z20.s, z20.s, z28.s\n"
+      "add z21.s, z21.s, z27.s\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      "add z22.s, z22.s, z26.s\n"
+      "add z23.s, z23.s, z25.s\n"
+      ".inst 0x04b87610  // sqrdmulh z16.s, z16.s, z24.s\n"
+      ".inst 0x04b87631  // sqrdmulh z17.s, z17.s, z24.s\n"
+      ".inst 0x04b87652  // sqrdmulh z18.s, z18.s, z24.s\n"
+      ".inst 0x04b87673  // sqrdmulh z19.s, z19.s, z24.s\n"
+      ".inst 0x04b87694  // sqrdmulh z20.s, z20.s, z24.s\n"
+      ".inst 0x04b876b5  // sqrdmulh z21.s, z21.s, z24.s\n"
+      ".inst 0x04b876d6  // sqrdmulh z22.s, z22.s, z24.s\n"
+      ".inst 0x04b876f7  // sqrdmulh z23.s, z23.s, z24.s\n"
       "tbz %x[flags], #5, 27f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "and z8.d, z20.d, z0.d\n"
-      "and z9.d, z21.d, z0.d\n"
-      "and z10.d, z22.d, z0.d\n"
-      "and z4.d, z23.d, z0.d\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "sqadd z20.s, z20.s, z8.s\n"
-      "sqadd z21.s, z21.s, z9.s\n"
-      "sqadd z22.s, z22.s, z10.s\n"
-      "sqadd z23.s, z23.s, z4.s\n"
+      "and z24.d, z16.d, z0.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z24.s\n"
+      "and z30.d, z17.d, z0.d\n"
+      "and z29.d, z18.d, z0.d\n"
+      "and z28.d, z19.d, z0.d\n"
+      "and z27.d, z20.d, z0.d\n"
+      "and z26.d, z21.d, z0.d\n"
+      "and z25.d, z22.d, z0.d\n"
+      "and z24.d, z23.d, z0.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z30.s\n"
+      "sqadd z18.s, z18.s, z29.s\n"
+      "sqadd z19.s, z19.s, z28.s\n"
+      "sqadd z20.s, z20.s, z27.s\n"
+      "sqadd z21.s, z21.s, z26.s\n"
+      "sqadd z22.s, z22.s, z25.s\n"
+      "sqadd z23.s, z23.s, z24.s\n"
       "27:"  // Height 2: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z24.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z24.s\n"
+      "add z18.s, z18.s, z24.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z24.s\n"
+      "add z20.s, z20.s, z24.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z24.s\n"
+      "add z22.s, z22.s, z24.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z23.s, z23.s, z4.s\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
+      "add z23.s, z23.s, z24.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z25.s\n"
+      "smin z17.s, p2/M, z17.s, z25.s\n"
+      "smin z18.s, p2/M, z18.s, z25.s\n"
+      "smin z19.s, p2/M, z19.s, z25.s\n"
+      "smin z20.s, p2/M, z20.s, z25.s\n"
+      "smin z21.s, p2/M, z21.s, z25.s\n"
+      "smin z22.s, p2/M, z22.s, z25.s\n"
+      "smin z23.s, p2/M, z23.s, z25.s\n"
+      "smax z16.s, p2/M, z16.s, z24.s\n"
+      "smax z17.s, p2/M, z17.s, z24.s\n"
+      "smax z18.s, p2/M, z18.s, z24.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z24.s\n"
+      "smax z20.s, p2/M, z20.s, z24.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z24.s\n"
+      "smax z22.s, p2/M, z22.s, z24.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
       "st1b { z16.b }, p1, [x27]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
+      "smax z23.s, p2/M, z23.s, z24.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
       "st1b { z20.b }, p1, [x23]\n"
       "addvl x27, x27, #1\n"
       "28:"  // Height 2: Writeback done
@@ -624,13 +624,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -639,8 +639,8 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "34:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "ble 37f\n"
@@ -650,73 +650,73 @@
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x24, x24, #0x10\n"
       "ld1rqb { z2.b }, p0/Z, [x22]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "sdot z24.s, z4.b, z2.b[0]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z21.s, z5.b, z1.b[0]\n"
-      "sdot z25.s, z5.b, z2.b[0]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z22.s, z6.b, z1.b[0]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "sdot z26.s, z6.b, z2.b[0]\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28]\n"
+      "sdot z16.s, z28.b, z0.b[0]\n"
+      "sdot z20.s, z28.b, z1.b[0]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "sdot z24.s, z28.b, z2.b[0]\n"
+      "sdot z17.s, z30.b, z0.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z21.s, z30.b, z1.b[0]\n"
+      "sdot z25.s, z30.b, z2.b[0]\n"
+      "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "sdot z18.s, z29.b, z0.b[0]\n"
+      "sdot z22.s, z29.b, z1.b[0]\n"
+      "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "sdot z26.s, z29.b, z2.b[0]\n"
+      "sdot z19.s, z28.b, z0.b[0]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "sdot z23.s, z7.b, z1.b[0]\n"
-      "sdot z27.s, z7.b, z2.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "sdot z20.s, z8.b, z1.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "sdot z23.s, z28.b, z1.b[0]\n"
+      "sdot z27.s, z28.b, z2.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "sdot z16.s, z3.b, z0.b[1]\n"
+      "sdot z20.s, z3.b, z1.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      "sdot z24.s, z8.b, z2.b[1]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "sdot z24.s, z3.b, z2.b[1]\n"
+      "sdot z17.s, z31.b, z0.b[1]\n"
+      "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      "sdot z21.s, z9.b, z1.b[1]\n"
-      "sdot z25.s, z9.b, z2.b[1]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "sdot z22.s, z10.b, z1.b[1]\n"
-      "sdot z26.s, z10.b, z2.b[1]\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "sdot z23.s, z4.b, z1.b[1]\n"
-      "sdot z27.s, z4.b, z2.b[1]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "sdot z20.s, z5.b, z1.b[2]\n"
-      "sdot z24.s, z5.b, z2.b[2]\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      "sdot z21.s, z6.b, z1.b[2]\n"
-      "sdot z25.s, z6.b, z2.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z22.s, z7.b, z1.b[2]\n"
-      "sdot z26.s, z7.b, z2.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
-      "sdot z23.s, z8.b, z1.b[2]\n"
-      "sdot z27.s, z8.b, z2.b[2]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z20.s, z9.b, z1.b[3]\n"
-      "sdot z24.s, z9.b, z2.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "sdot z21.s, z10.b, z1.b[3]\n"
-      "sdot z25.s, z10.b, z2.b[3]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z22.s, z4.b, z1.b[3]\n"
-      "sdot z26.s, z4.b, z2.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z23.s, z5.b, z1.b[3]\n"
-      "sdot z27.s, z5.b, z2.b[3]\n"
+      "sdot z21.s, z31.b, z1.b[1]\n"
+      "sdot z25.s, z31.b, z2.b[1]\n"
+      "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "sdot z18.s, z30.b, z0.b[1]\n"
+      "sdot z22.s, z30.b, z1.b[1]\n"
+      "sdot z26.s, z30.b, z2.b[1]\n"
+      "sdot z19.s, z29.b, z0.b[1]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "sdot z23.s, z29.b, z1.b[1]\n"
+      "sdot z27.s, z29.b, z2.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "sdot z16.s, z28.b, z0.b[2]\n"
+      "sdot z20.s, z28.b, z1.b[2]\n"
+      "sdot z24.s, z28.b, z2.b[2]\n"
+      "sdot z17.s, z5.b, z0.b[2]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "sdot z21.s, z5.b, z1.b[2]\n"
+      "sdot z25.s, z5.b, z2.b[2]\n"
+      "sdot z18.s, z4.b, z0.b[2]\n"
+      "sdot z22.s, z4.b, z1.b[2]\n"
+      "sdot z26.s, z4.b, z2.b[2]\n"
+      "sdot z19.s, z3.b, z0.b[2]\n"
+      "sdot z23.s, z3.b, z1.b[2]\n"
+      "sdot z27.s, z3.b, z2.b[2]\n"
+      "sdot z16.s, z31.b, z0.b[3]\n"
+      "sdot z20.s, z31.b, z1.b[3]\n"
+      "sdot z24.s, z31.b, z2.b[3]\n"
+      "sdot z17.s, z30.b, z0.b[3]\n"
+      "sdot z21.s, z30.b, z1.b[3]\n"
+      "sdot z25.s, z30.b, z2.b[3]\n"
+      "sdot z18.s, z29.b, z0.b[3]\n"
+      "sdot z22.s, z29.b, z1.b[3]\n"
+      "sdot z26.s, z29.b, z2.b[3]\n"
+      "sdot z19.s, z28.b, z0.b[3]\n"
+      "sdot z23.s, z28.b, z1.b[3]\n"
+      "sdot z27.s, z28.b, z2.b[3]\n"
       "tbnz %x[flags], #31, 36f\n"
       "sdot z11.s, z0.b, z15.b\n"
       "sdot z12.s, z1.b, z15.b\n"
@@ -731,79 +731,79 @@
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "subs x25, x25, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x22]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "sdot z24.s, z4.b, z2.b[0]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z21.s, z5.b, z1.b[0]\n"
-      "sdot z25.s, z5.b, z2.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28]\n"
+      "sdot z16.s, z28.b, z0.b[0]\n"
+      "sdot z20.s, z28.b, z1.b[0]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "sdot z24.s, z28.b, z2.b[0]\n"
+      "sdot z17.s, z30.b, z0.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z21.s, z30.b, z1.b[0]\n"
+      "sdot z25.s, z30.b, z2.b[0]\n"
       "addvl x28, x28, #4\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z22.s, z6.b, z1.b[0]\n"
-      "sdot z26.s, z6.b, z2.b[0]\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "sdot z23.s, z7.b, z1.b[0]\n"
-      "sdot z27.s, z7.b, z2.b[0]\n"
+      "sdot z18.s, z29.b, z0.b[0]\n"
+      "sdot z22.s, z29.b, z1.b[0]\n"
+      "sdot z26.s, z29.b, z2.b[0]\n"
+      "sdot z19.s, z28.b, z0.b[0]\n"
+      "sdot z23.s, z28.b, z1.b[0]\n"
+      "sdot z27.s, z28.b, z2.b[0]\n"
       "ble 38f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z31.b }, p2/Z, [x28]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z20.s, z8.b, z1.b[1]\n"
-      "sdot z24.s, z8.b, z2.b[1]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "sdot z21.s, z9.b, z1.b[1]\n"
+      "sdot z16.s, z31.b, z0.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z20.s, z31.b, z1.b[1]\n"
+      "sdot z24.s, z31.b, z2.b[1]\n"
+      "sdot z17.s, z30.b, z0.b[1]\n"
+      "sdot z21.s, z30.b, z1.b[1]\n"
       "addvl x28, x28, #4\n"
-      "sdot z25.s, z9.b, z2.b[1]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "sdot z22.s, z10.b, z1.b[1]\n"
-      "sdot z26.s, z10.b, z2.b[1]\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
-      "sdot z23.s, z4.b, z1.b[1]\n"
-      "sdot z27.s, z4.b, z2.b[1]\n"
+      "sdot z25.s, z30.b, z2.b[1]\n"
+      "sdot z18.s, z29.b, z0.b[1]\n"
+      "sdot z22.s, z29.b, z1.b[1]\n"
+      "sdot z26.s, z29.b, z2.b[1]\n"
+      "sdot z19.s, z28.b, z0.b[1]\n"
+      "sdot z23.s, z28.b, z1.b[1]\n"
+      "sdot z27.s, z28.b, z2.b[1]\n"
       "ble 38f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z31.b }, p2/Z, [x28]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z20.s, z5.b, z1.b[2]\n"
-      "sdot z24.s, z5.b, z2.b[2]\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "sdot z21.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z31.b, z0.b[2]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z20.s, z31.b, z1.b[2]\n"
+      "sdot z24.s, z31.b, z2.b[2]\n"
+      "sdot z17.s, z30.b, z0.b[2]\n"
+      "sdot z21.s, z30.b, z1.b[2]\n"
       "addvl x28, x28, #4\n"
-      "sdot z25.s, z6.b, z2.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z22.s, z7.b, z1.b[2]\n"
-      "sdot z26.s, z7.b, z2.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
-      "sdot z23.s, z8.b, z1.b[2]\n"
-      "sdot z27.s, z8.b, z2.b[2]\n"
+      "sdot z25.s, z30.b, z2.b[2]\n"
+      "sdot z18.s, z29.b, z0.b[2]\n"
+      "sdot z22.s, z29.b, z1.b[2]\n"
+      "sdot z26.s, z29.b, z2.b[2]\n"
+      "sdot z19.s, z28.b, z0.b[2]\n"
+      "sdot z23.s, z28.b, z1.b[2]\n"
+      "sdot z27.s, z28.b, z2.b[2]\n"
       "ble 38f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z20.s, z9.b, z1.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z24.s, z9.b, z2.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "sdot z21.s, z10.b, z1.b[3]\n"
-      "sdot z25.s, z10.b, z2.b[3]\n"
+      "ld1b { z31.b }, p2/Z, [x28]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z31.b, z0.b[3]\n"
+      "sdot z20.s, z31.b, z1.b[3]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z24.s, z31.b, z2.b[3]\n"
+      "sdot z17.s, z30.b, z0.b[3]\n"
+      "sdot z21.s, z30.b, z1.b[3]\n"
+      "sdot z25.s, z30.b, z2.b[3]\n"
       "addvl x28, x28, #4\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z22.s, z4.b, z1.b[3]\n"
-      "sdot z26.s, z4.b, z2.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z23.s, z5.b, z1.b[3]\n"
-      "sdot z27.s, z5.b, z2.b[3]\n"
+      "sdot z18.s, z29.b, z0.b[3]\n"
+      "sdot z22.s, z29.b, z1.b[3]\n"
+      "sdot z26.s, z29.b, z2.b[3]\n"
+      "sdot z19.s, z28.b, z0.b[3]\n"
+      "sdot z23.s, z28.b, z1.b[3]\n"
+      "sdot z27.s, z28.b, z2.b[3]\n"
       "38:"  // Height 3: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 39f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -821,33 +821,33 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z3.s }, p2/Z, [x20]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       "saddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
       "saddv d12, p0, z12.s\n"
       "saddv d13, p0, z13.s\n"
       "mov z12.s, z12.s[0]\n"
       "mov z13.s, z13.s[0]\n"
-      "neg z3.s, p2/M, z3.s\n"
-      "mul z11.s, p2/M, z11.s, z3.s\n"
-      "mul z12.s, p2/M, z12.s, z3.s\n"
-      "mul z13.s, p2/M, z13.s, z3.s\n"
+      "neg z28.s, p2/M, z28.s\n"
+      "mul z11.s, p2/M, z11.s, z28.s\n"
+      "mul z12.s, p2/M, z12.s, z28.s\n"
+      "mul z13.s, p2/M, z13.s, z28.s\n"
       "40:"  // Height 3: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
       "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z20.s, z20.s, z12.s\n"
       "add z21.s, z21.s, z12.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z22.s, z22.s, z12.s\n"
       "add z23.s, z23.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z24.s, z24.s, z13.s\n"
       "add z25.s, z25.s, z13.s\n"
@@ -855,133 +855,133 @@
       "add z26.s, z26.s, z13.s\n"
       "add z27.s, z27.s, z13.s\n"
       "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "add z17.s, z17.s, z31.s\n"
+      "add z18.s, z18.s, z30.s\n"
+      "add z19.s, z19.s, z29.s\n"
       "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
+      "add z21.s, z21.s, z31.s\n"
+      "add z22.s, z22.s, z30.s\n"
+      "add z23.s, z23.s, z29.s\n"
       "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
+      "add z25.s, z25.s, z31.s\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      "add z26.s, z26.s, z30.s\n"
+      "add z27.s, z27.s, z29.s\n"
+      ".inst 0x04bc7610  // sqrdmulh z16.s, z16.s, z28.s\n"
+      ".inst 0x04bc7631  // sqrdmulh z17.s, z17.s, z28.s\n"
+      ".inst 0x04bc7652  // sqrdmulh z18.s, z18.s, z28.s\n"
+      ".inst 0x04bc7673  // sqrdmulh z19.s, z19.s, z28.s\n"
+      ".inst 0x04bc7694  // sqrdmulh z20.s, z20.s, z28.s\n"
+      ".inst 0x04bc76b5  // sqrdmulh z21.s, z21.s, z28.s\n"
+      ".inst 0x04bc76d6  // sqrdmulh z22.s, z22.s, z28.s\n"
+      ".inst 0x04bc76f7  // sqrdmulh z23.s, z23.s, z28.s\n"
+      ".inst 0x04bc7718  // sqrdmulh z24.s, z24.s, z28.s\n"
+      ".inst 0x04bc7739  // sqrdmulh z25.s, z25.s, z28.s\n"
+      ".inst 0x04bc775a  // sqrdmulh z26.s, z26.s, z28.s\n"
+      ".inst 0x04bc777b  // sqrdmulh z27.s, z27.s, z28.s\n"
       "tbz %x[flags], #5, 41f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "and z8.d, z20.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "sqadd z20.s, z20.s, z8.s\n"
-      "and z9.d, z21.d, z0.d\n"
-      "and z10.d, z22.d, z0.d\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z24.d, z0.d\n"
-      "and z6.d, z25.d, z0.d\n"
-      "and z7.d, z26.d, z0.d\n"
-      "and z8.d, z27.d, z0.d\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z21.s, z21.s, z9.s\n"
-      "sqadd z22.s, z22.s, z10.s\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z24.s, z24.s, z5.s\n"
-      "sqadd z25.s, z25.s, z6.s\n"
-      "sqadd z26.s, z26.s, z7.s\n"
-      "sqadd z27.s, z27.s, z8.s\n"
+      "and z1.d, z16.d, z0.d\n"
+      "and z31.d, z17.d, z0.d\n"
+      "and z30.d, z18.d, z0.d\n"
+      "and z29.d, z19.d, z0.d\n"
+      "and z28.d, z20.d, z0.d\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z1.s\n"
+      "sqadd z17.s, z17.s, z31.s\n"
+      "sqadd z18.s, z18.s, z30.s\n"
+      "sqadd z19.s, z19.s, z29.s\n"
+      "sqadd z20.s, z20.s, z28.s\n"
+      "and z3.d, z21.d, z0.d\n"
+      "and z2.d, z22.d, z0.d\n"
+      "and z1.d, z23.d, z0.d\n"
+      "and z31.d, z24.d, z0.d\n"
+      "and z30.d, z25.d, z0.d\n"
+      "and z29.d, z26.d, z0.d\n"
+      "and z28.d, z27.d, z0.d\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z3.s\n"
+      "sqadd z22.s, z22.s, z2.s\n"
+      "sqadd z23.s, z23.s, z1.s\n"
+      "sqadd z24.s, z24.s, z31.s\n"
+      "sqadd z25.s, z25.s, z30.s\n"
+      "sqadd z26.s, z26.s, z29.s\n"
+      "sqadd z27.s, z27.s, z28.s\n"
       "41:"  // Height 3: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z28.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z28.s\n"
+      "add z18.s, z18.s, z28.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z28.s\n"
+      "add z20.s, z20.s, z28.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z28.s\n"
+      "add z22.s, z22.s, z28.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z23.s, z23.s, z28.s\n"
+      "add z24.s, z24.s, z28.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z28.s\n"
+      "add z26.s, z26.s, z28.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z27.s, z27.s, z4.s\n"
+      "ld1rw { z29.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z28.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z29.s\n"
+      "smin z17.s, p2/M, z17.s, z29.s\n"
+      "smin z18.s, p2/M, z18.s, z29.s\n"
+      "smin z19.s, p2/M, z19.s, z29.s\n"
+      "smin z20.s, p2/M, z20.s, z29.s\n"
+      "smin z21.s, p2/M, z21.s, z29.s\n"
+      "smin z22.s, p2/M, z22.s, z29.s\n"
+      "smin z23.s, p2/M, z23.s, z29.s\n"
+      "smin z24.s, p2/M, z24.s, z29.s\n"
+      "smin z25.s, p2/M, z25.s, z29.s\n"
+      "smin z26.s, p2/M, z26.s, z29.s\n"
+      "smin z27.s, p2/M, z27.s, z29.s\n"
+      "smax z16.s, p2/M, z16.s, z28.s\n"
+      "smax z17.s, p2/M, z17.s, z28.s\n"
+      "smax z18.s, p2/M, z18.s, z28.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z28.s\n"
+      "smax z20.s, p2/M, z20.s, z28.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z28.s\n"
+      "smax z22.s, p2/M, z22.s, z28.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
       "st1b { z16.b }, p1, [x27]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z28.s\n"
+      "smax z24.s, p2/M, z24.s, z28.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z28.s\n"
+      "smax z26.s, p2/M, z26.s, z28.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
       "st1b { z20.b }, p1, [x23]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
+      "smax z27.s, p2/M, z27.s, z28.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
       "st1b { z24.b }, p1, [x22]\n"
       "addvl x27, x27, #1\n"
       "42:"  // Height 3: Writeback done
@@ -1027,14 +1027,14 @@
       "46:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 47f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 48f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1044,9 +1044,9 @@
       "b 48f\n"
       "47:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "48:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "ble 51f\n"
@@ -1059,88 +1059,88 @@
       "ld1rqb { z3.b }, p0/Z, [x21]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z24.s, z4.b, z2.b[0]\n"
-      "sdot z28.s, z4.b, z3.b[0]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "sdot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x28]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z5.b, z0.b[0]\n"
+      "sdot z20.s, z5.b, z1.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z24.s, z5.b, z2.b[0]\n"
+      "sdot z28.s, z5.b, z3.b[0]\n"
+      "sdot z17.s, z4.b, z0.b[0]\n"
+      "sdot z21.s, z4.b, z1.b[0]\n"
       "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "sdot z25.s, z5.b, z2.b[0]\n"
-      "sdot z29.s, z5.b, z3.b[0]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "sdot z25.s, z4.b, z2.b[0]\n"
+      "sdot z29.s, z4.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "sdot z18.s, z10.b, z0.b[0]\n"
+      "sdot z22.s, z10.b, z1.b[0]\n"
       "addvl x28, x28, #16\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "sdot z26.s, z6.b, z2.b[0]\n"
-      "sdot z30.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "sdot z26.s, z10.b, z2.b[0]\n"
+      "sdot z30.s, z10.b, z3.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "sdot z23.s, z7.b, z1.b[0]\n"
-      "sdot z27.s, z7.b, z2.b[0]\n"
-      "sdot z31.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "sdot z19.s, z9.b, z0.b[0]\n"
+      "sdot z23.s, z9.b, z1.b[0]\n"
+      "sdot z27.s, z9.b, z2.b[0]\n"
+      "sdot z31.s, z9.b, z3.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
       "sdot z16.s, z8.b, z0.b[1]\n"
       "sdot z20.s, z8.b, z1.b[1]\n"
       "sdot z24.s, z8.b, z2.b[1]\n"
       "sdot z28.s, z8.b, z3.b[1]\n"
       "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
-      "sdot z21.s, z9.b, z1.b[1]\n"
-      "sdot z25.s, z9.b, z2.b[1]\n"
-      "sdot z29.s, z9.b, z3.b[1]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "sdot z22.s, z10.b, z1.b[1]\n"
-      "sdot z26.s, z10.b, z2.b[1]\n"
-      "sdot z30.s, z10.b, z3.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "sdot z19.s, z4.b, z0.b[1]\n"
-      "sdot z23.s, z4.b, z1.b[1]\n"
-      "sdot z27.s, z4.b, z2.b[1]\n"
-      "sdot z31.s, z4.b, z3.b[1]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "sdot z20.s, z5.b, z1.b[2]\n"
-      "sdot z24.s, z5.b, z2.b[2]\n"
-      "sdot z28.s, z5.b, z3.b[2]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      "sdot z17.s, z6.b, z0.b[2]\n"
-      "sdot z21.s, z6.b, z1.b[2]\n"
-      "sdot z25.s, z6.b, z2.b[2]\n"
-      "sdot z29.s, z6.b, z3.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z22.s, z7.b, z1.b[2]\n"
-      "sdot z26.s, z7.b, z2.b[2]\n"
-      "sdot z30.s, z7.b, z3.b[2]\n"
+      "sdot z17.s, z7.b, z0.b[1]\n"
+      "sdot z21.s, z7.b, z1.b[1]\n"
+      "sdot z25.s, z7.b, z2.b[1]\n"
+      "sdot z29.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "sdot z18.s, z6.b, z0.b[1]\n"
+      "sdot z22.s, z6.b, z1.b[1]\n"
+      "sdot z26.s, z6.b, z2.b[1]\n"
+      "sdot z30.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "sdot z19.s, z5.b, z0.b[1]\n"
+      "sdot z23.s, z5.b, z1.b[1]\n"
+      "sdot z27.s, z5.b, z2.b[1]\n"
+      "sdot z31.s, z5.b, z3.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "sdot z16.s, z4.b, z0.b[2]\n"
+      "sdot z20.s, z4.b, z1.b[2]\n"
+      "sdot z24.s, z4.b, z2.b[2]\n"
+      "sdot z28.s, z4.b, z3.b[2]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "sdot z17.s, z10.b, z0.b[2]\n"
+      "sdot z21.s, z10.b, z1.b[2]\n"
+      "sdot z25.s, z10.b, z2.b[2]\n"
+      "sdot z29.s, z10.b, z3.b[2]\n"
+      "sdot z18.s, z9.b, z0.b[2]\n"
+      "sdot z22.s, z9.b, z1.b[2]\n"
+      "sdot z26.s, z9.b, z2.b[2]\n"
+      "sdot z30.s, z9.b, z3.b[2]\n"
       "sdot z19.s, z8.b, z0.b[2]\n"
       "sdot z23.s, z8.b, z1.b[2]\n"
       "sdot z27.s, z8.b, z2.b[2]\n"
       "sdot z31.s, z8.b, z3.b[2]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z20.s, z9.b, z1.b[3]\n"
-      "sdot z24.s, z9.b, z2.b[3]\n"
-      "sdot z28.s, z9.b, z3.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "sdot z21.s, z10.b, z1.b[3]\n"
-      "sdot z25.s, z10.b, z2.b[3]\n"
-      "sdot z29.s, z10.b, z3.b[3]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z22.s, z4.b, z1.b[3]\n"
-      "sdot z26.s, z4.b, z2.b[3]\n"
-      "sdot z30.s, z4.b, z3.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z23.s, z5.b, z1.b[3]\n"
-      "sdot z27.s, z5.b, z2.b[3]\n"
-      "sdot z31.s, z5.b, z3.b[3]\n"
+      "sdot z16.s, z7.b, z0.b[3]\n"
+      "sdot z20.s, z7.b, z1.b[3]\n"
+      "sdot z24.s, z7.b, z2.b[3]\n"
+      "sdot z28.s, z7.b, z3.b[3]\n"
+      "sdot z17.s, z6.b, z0.b[3]\n"
+      "sdot z21.s, z6.b, z1.b[3]\n"
+      "sdot z25.s, z6.b, z2.b[3]\n"
+      "sdot z29.s, z6.b, z3.b[3]\n"
+      "sdot z18.s, z5.b, z0.b[3]\n"
+      "sdot z22.s, z5.b, z1.b[3]\n"
+      "sdot z26.s, z5.b, z2.b[3]\n"
+      "sdot z30.s, z5.b, z3.b[3]\n"
+      "sdot z19.s, z4.b, z0.b[3]\n"
+      "sdot z23.s, z4.b, z1.b[3]\n"
+      "sdot z27.s, z4.b, z2.b[3]\n"
+      "sdot z31.s, z4.b, z3.b[3]\n"
       "tbnz %x[flags], #31, 50f\n"
       "sdot z11.s, z0.b, z15.b\n"
       "sdot z12.s, z1.b, z15.b\n"
@@ -1157,95 +1157,95 @@
       "subs x25, x25, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x22]\n"
       "ld1rqb { z3.b }, p0/Z, [x21]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z4.b, z0.b[0]\n"
-      "sdot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z24.s, z4.b, z2.b[0]\n"
-      "sdot z28.s, z4.b, z3.b[0]\n"
-      "sdot z17.s, z5.b, z0.b[0]\n"
-      "sdot z21.s, z5.b, z1.b[0]\n"
-      "addvl x28, x28, #4\n"
-      "sdot z25.s, z5.b, z2.b[0]\n"
-      "sdot z29.s, z5.b, z3.b[0]\n"
-      "sdot z18.s, z6.b, z0.b[0]\n"
-      "sdot z22.s, z6.b, z1.b[0]\n"
-      "sdot z26.s, z6.b, z2.b[0]\n"
-      "sdot z30.s, z6.b, z3.b[0]\n"
-      "sdot z19.s, z7.b, z0.b[0]\n"
-      "sdot z23.s, z7.b, z1.b[0]\n"
-      "sdot z27.s, z7.b, z2.b[0]\n"
-      "sdot z31.s, z7.b, z3.b[0]\n"
-      "ble 52f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "subs x25, x25, #0x4\n"
-      "sdot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z7.b, z0.b[0]\n"
+      "sdot z20.s, z7.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
       "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z20.s, z8.b, z1.b[1]\n"
-      "sdot z24.s, z8.b, z2.b[1]\n"
-      "sdot z28.s, z8.b, z3.b[1]\n"
-      "sdot z17.s, z9.b, z0.b[1]\n"
+      "sdot z24.s, z7.b, z2.b[0]\n"
+      "sdot z28.s, z7.b, z3.b[0]\n"
+      "sdot z17.s, z6.b, z0.b[0]\n"
+      "sdot z21.s, z6.b, z1.b[0]\n"
       "addvl x28, x28, #4\n"
-      "sdot z21.s, z9.b, z1.b[1]\n"
-      "sdot z25.s, z9.b, z2.b[1]\n"
-      "sdot z29.s, z9.b, z3.b[1]\n"
-      "sdot z18.s, z10.b, z0.b[1]\n"
-      "sdot z22.s, z10.b, z1.b[1]\n"
-      "sdot z26.s, z10.b, z2.b[1]\n"
-      "sdot z30.s, z10.b, z3.b[1]\n"
+      "sdot z25.s, z6.b, z2.b[0]\n"
+      "sdot z29.s, z6.b, z3.b[0]\n"
+      "sdot z18.s, z5.b, z0.b[0]\n"
+      "sdot z22.s, z5.b, z1.b[0]\n"
+      "sdot z26.s, z5.b, z2.b[0]\n"
+      "sdot z30.s, z5.b, z3.b[0]\n"
+      "sdot z19.s, z4.b, z0.b[0]\n"
+      "sdot z23.s, z4.b, z1.b[0]\n"
+      "sdot z27.s, z4.b, z2.b[0]\n"
+      "sdot z31.s, z4.b, z3.b[0]\n"
+      "ble 52f\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "subs x25, x25, #0x4\n"
+      "sdot z16.s, z7.b, z0.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z20.s, z7.b, z1.b[1]\n"
+      "sdot z24.s, z7.b, z2.b[1]\n"
+      "sdot z28.s, z7.b, z3.b[1]\n"
+      "sdot z17.s, z6.b, z0.b[1]\n"
+      "addvl x28, x28, #4\n"
+      "sdot z21.s, z6.b, z1.b[1]\n"
+      "sdot z25.s, z6.b, z2.b[1]\n"
+      "sdot z29.s, z6.b, z3.b[1]\n"
+      "sdot z18.s, z5.b, z0.b[1]\n"
+      "sdot z22.s, z5.b, z1.b[1]\n"
+      "sdot z26.s, z5.b, z2.b[1]\n"
+      "sdot z30.s, z5.b, z3.b[1]\n"
       "sdot z19.s, z4.b, z0.b[1]\n"
       "sdot z23.s, z4.b, z1.b[1]\n"
       "sdot z27.s, z4.b, z2.b[1]\n"
       "sdot z31.s, z4.b, z3.b[1]\n"
       "ble 52f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
       "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z20.s, z5.b, z1.b[2]\n"
-      "sdot z24.s, z5.b, z2.b[2]\n"
-      "sdot z28.s, z5.b, z3.b[2]\n"
+      "sdot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z20.s, z7.b, z1.b[2]\n"
+      "sdot z24.s, z7.b, z2.b[2]\n"
+      "sdot z28.s, z7.b, z3.b[2]\n"
       "sdot z17.s, z6.b, z0.b[2]\n"
       "addvl x28, x28, #4\n"
       "sdot z21.s, z6.b, z1.b[2]\n"
       "sdot z25.s, z6.b, z2.b[2]\n"
       "sdot z29.s, z6.b, z3.b[2]\n"
-      "sdot z18.s, z7.b, z0.b[2]\n"
-      "sdot z22.s, z7.b, z1.b[2]\n"
-      "sdot z26.s, z7.b, z2.b[2]\n"
-      "sdot z30.s, z7.b, z3.b[2]\n"
-      "sdot z19.s, z8.b, z0.b[2]\n"
-      "sdot z23.s, z8.b, z1.b[2]\n"
-      "sdot z27.s, z8.b, z2.b[2]\n"
-      "sdot z31.s, z8.b, z3.b[2]\n"
+      "sdot z18.s, z5.b, z0.b[2]\n"
+      "sdot z22.s, z5.b, z1.b[2]\n"
+      "sdot z26.s, z5.b, z2.b[2]\n"
+      "sdot z30.s, z5.b, z3.b[2]\n"
+      "sdot z19.s, z4.b, z0.b[2]\n"
+      "sdot z23.s, z4.b, z1.b[2]\n"
+      "sdot z27.s, z4.b, z2.b[2]\n"
+      "sdot z31.s, z4.b, z3.b[2]\n"
       "ble 52f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "sdot z16.s, z9.b, z0.b[3]\n"
-      "sdot z20.s, z9.b, z1.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "sdot z24.s, z9.b, z2.b[3]\n"
-      "sdot z28.s, z9.b, z3.b[3]\n"
-      "sdot z17.s, z10.b, z0.b[3]\n"
-      "sdot z21.s, z10.b, z1.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "sdot z16.s, z7.b, z0.b[3]\n"
+      "sdot z20.s, z7.b, z1.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "sdot z24.s, z7.b, z2.b[3]\n"
+      "sdot z28.s, z7.b, z3.b[3]\n"
+      "sdot z17.s, z6.b, z0.b[3]\n"
+      "sdot z21.s, z6.b, z1.b[3]\n"
       "addvl x28, x28, #4\n"
-      "sdot z25.s, z10.b, z2.b[3]\n"
-      "sdot z29.s, z10.b, z3.b[3]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z22.s, z4.b, z1.b[3]\n"
-      "sdot z26.s, z4.b, z2.b[3]\n"
-      "sdot z30.s, z4.b, z3.b[3]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z23.s, z5.b, z1.b[3]\n"
-      "sdot z27.s, z5.b, z2.b[3]\n"
-      "sdot z31.s, z5.b, z3.b[3]\n"
+      "sdot z25.s, z6.b, z2.b[3]\n"
+      "sdot z29.s, z6.b, z3.b[3]\n"
+      "sdot z18.s, z5.b, z0.b[3]\n"
+      "sdot z22.s, z5.b, z1.b[3]\n"
+      "sdot z26.s, z5.b, z2.b[3]\n"
+      "sdot z30.s, z5.b, z3.b[3]\n"
+      "sdot z19.s, z4.b, z0.b[3]\n"
+      "sdot z23.s, z4.b, z1.b[3]\n"
+      "sdot z27.s, z4.b, z2.b[3]\n"
+      "sdot z31.s, z4.b, z3.b[3]\n"
       "52:"  // Height 4: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 53f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -1265,7 +1265,7 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
       "saddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
       "saddv d12, p0, z12.s\n"
@@ -1273,28 +1273,28 @@
       "mov z12.s, z12.s[0]\n"
       "mov z13.s, z13.s[0]\n"
       "saddv d14, p0, z14.s\n"
-      "neg z4.s, p2/M, z4.s\n"
+      "neg z0.s, p2/M, z0.s\n"
       "mov z14.s, z14.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z4.s\n"
-      "mul z12.s, p2/M, z12.s, z4.s\n"
-      "mul z13.s, p2/M, z13.s, z4.s\n"
-      "mul z14.s, p2/M, z14.s, z4.s\n"
+      "mul z11.s, p2/M, z11.s, z0.s\n"
+      "mul z12.s, p2/M, z12.s, z0.s\n"
+      "mul z13.s, p2/M, z13.s, z0.s\n"
+      "mul z14.s, p2/M, z14.s, z0.s\n"
       "54:"  // Height 4: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z4.s }, p2/Z, [x10]\n"
+      "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z20.s, z20.s, z12.s\n"
       "add z21.s, z21.s, z12.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z22.s, z22.s, z12.s\n"
       "add z23.s, z23.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z24.s, z24.s, z13.s\n"
       "add z25.s, z25.s, z13.s\n"
@@ -1305,174 +1305,174 @@
       "add z29.s, z29.s, z14.s\n"
       "add z30.s, z30.s, z14.s\n"
       "add z31.s, z31.s, z14.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      "add z28.s, z28.s, z0.s\n"
-      "add z29.s, z29.s, z1.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z0.s\n"
+      "add z18.s, z18.s, z3.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "add z21.s, z21.s, z0.s\n"
+      "add z22.s, z22.s, z3.s\n"
+      "add z23.s, z23.s, z2.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z0.s\n"
+      "add z26.s, z26.s, z3.s\n"
+      "add z27.s, z27.s, z2.s\n"
+      "add z28.s, z28.s, z4.s\n"
+      "add z29.s, z29.s, z0.s\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      "add z30.s, z30.s, z2.s\n"
-      "add z31.s, z31.s, z3.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
-      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
-      ".inst 0x04a477bd  // sqrdmulh z29.s, z29.s, z4.s\n"
-      ".inst 0x04a477de  // sqrdmulh z30.s, z30.s, z4.s\n"
-      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
+      "add z30.s, z30.s, z3.s\n"
+      "add z31.s, z31.s, z2.s\n"
+      ".inst 0x04a17610  // sqrdmulh z16.s, z16.s, z1.s\n"
+      ".inst 0x04a17631  // sqrdmulh z17.s, z17.s, z1.s\n"
+      ".inst 0x04a17652  // sqrdmulh z18.s, z18.s, z1.s\n"
+      ".inst 0x04a17673  // sqrdmulh z19.s, z19.s, z1.s\n"
+      ".inst 0x04a17694  // sqrdmulh z20.s, z20.s, z1.s\n"
+      ".inst 0x04a176b5  // sqrdmulh z21.s, z21.s, z1.s\n"
+      ".inst 0x04a176d6  // sqrdmulh z22.s, z22.s, z1.s\n"
+      ".inst 0x04a176f7  // sqrdmulh z23.s, z23.s, z1.s\n"
+      ".inst 0x04a17718  // sqrdmulh z24.s, z24.s, z1.s\n"
+      ".inst 0x04a17739  // sqrdmulh z25.s, z25.s, z1.s\n"
+      ".inst 0x04a1775a  // sqrdmulh z26.s, z26.s, z1.s\n"
+      ".inst 0x04a1777b  // sqrdmulh z27.s, z27.s, z1.s\n"
+      ".inst 0x04a1779c  // sqrdmulh z28.s, z28.s, z1.s\n"
+      ".inst 0x04a177bd  // sqrdmulh z29.s, z29.s, z1.s\n"
+      ".inst 0x04a177de  // sqrdmulh z30.s, z30.s, z1.s\n"
+      ".inst 0x04a177ff  // sqrdmulh z31.s, z31.s, z1.s\n"
       "tbz %x[flags], #5, 55f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "and z8.d, z20.d, z0.d\n"
-      "and z9.d, z21.d, z0.d\n"
-      "and z10.d, z22.d, z0.d\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z24.d, z0.d\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "and z2.d, z16.d, z0.d\n"
+      "and z1.d, z17.d, z0.d\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z2.s\n"
+      "sqadd z17.s, z17.s, z1.s\n"
+      "and z7.d, z18.d, z0.d\n"
+      "and z6.d, z19.d, z0.d\n"
+      "and z5.d, z20.d, z0.d\n"
+      "and z4.d, z21.d, z0.d\n"
+      "and z3.d, z22.d, z0.d\n"
+      "and z2.d, z23.d, z0.d\n"
+      "and z1.d, z24.d, z0.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "sqadd z20.s, z20.s, z8.s\n"
-      "sqadd z21.s, z21.s, z9.s\n"
-      "sqadd z22.s, z22.s, z10.s\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z24.s, z24.s, z5.s\n"
-      "and z6.d, z25.d, z0.d\n"
-      "and z7.d, z26.d, z0.d\n"
-      "and z8.d, z27.d, z0.d\n"
-      "and z9.d, z28.d, z0.d\n"
-      "and z10.d, z29.d, z0.d\n"
-      "and z4.d, z30.d, z0.d\n"
-      "and z5.d, z31.d, z0.d\n"
       "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
       "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z25.s, z25.s, z6.s\n"
-      "sqadd z26.s, z26.s, z7.s\n"
-      "sqadd z27.s, z27.s, z8.s\n"
-      "sqadd z28.s, z28.s, z9.s\n"
-      "sqadd z29.s, z29.s, z10.s\n"
-      "sqadd z30.s, z30.s, z4.s\n"
-      "sqadd z31.s, z31.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z7.s\n"
+      "sqadd z19.s, z19.s, z6.s\n"
+      "sqadd z20.s, z20.s, z5.s\n"
+      "sqadd z21.s, z21.s, z4.s\n"
+      "sqadd z22.s, z22.s, z3.s\n"
+      "sqadd z23.s, z23.s, z2.s\n"
+      "sqadd z24.s, z24.s, z1.s\n"
+      "and z7.d, z25.d, z0.d\n"
+      "and z6.d, z26.d, z0.d\n"
+      "and z5.d, z27.d, z0.d\n"
+      "and z4.d, z28.d, z0.d\n"
+      "and z3.d, z29.d, z0.d\n"
+      "and z2.d, z30.d, z0.d\n"
+      "and z1.d, z31.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z25.s, z25.s, z7.s\n"
+      "sqadd z26.s, z26.s, z6.s\n"
+      "sqadd z27.s, z27.s, z5.s\n"
+      "sqadd z28.s, z28.s, z4.s\n"
+      "sqadd z29.s, z29.s, z3.s\n"
+      "sqadd z30.s, z30.s, z2.s\n"
+      "sqadd z31.s, z31.s, z1.s\n"
       "55:"  // Height 4: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z2.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z2.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z20.s, z20.s, z2.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z2.s\n"
+      "add z22.s, z22.s, z2.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z23.s, z23.s, z2.s\n"
+      "add z24.s, z24.s, z2.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z2.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
       ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add z28.s, z28.s, z4.s\n"
+      "add z27.s, z27.s, z2.s\n"
+      "add z28.s, z28.s, z2.s\n"
       ".inst 0x4482881d  // srshl z29.s, p2/M, z29.s, z0.s\n"
       ".inst 0x4482881e  // srshl z30.s, p2/M, z30.s, z0.s\n"
-      "add z29.s, z29.s, z4.s\n"
-      "add z30.s, z30.s, z4.s\n"
+      "add z29.s, z29.s, z2.s\n"
+      "add z30.s, z30.s, z2.s\n"
       ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z31.s, z31.s, z4.s\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "add z31.s, z31.s, z2.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smin z28.s, p2/M, z28.s, z6.s\n"
-      "smin z29.s, p2/M, z29.s, z6.s\n"
-      "smin z30.s, p2/M, z30.s, z6.s\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z1.s\n"
+      "smin z17.s, p2/M, z17.s, z1.s\n"
+      "smin z18.s, p2/M, z18.s, z1.s\n"
+      "smin z19.s, p2/M, z19.s, z1.s\n"
+      "smin z20.s, p2/M, z20.s, z1.s\n"
+      "smin z21.s, p2/M, z21.s, z1.s\n"
+      "smin z22.s, p2/M, z22.s, z1.s\n"
+      "smin z23.s, p2/M, z23.s, z1.s\n"
+      "smin z24.s, p2/M, z24.s, z1.s\n"
+      "smin z25.s, p2/M, z25.s, z1.s\n"
+      "smin z26.s, p2/M, z26.s, z1.s\n"
+      "smin z27.s, p2/M, z27.s, z1.s\n"
+      "smin z28.s, p2/M, z28.s, z1.s\n"
+      "smin z29.s, p2/M, z29.s, z1.s\n"
+      "smin z30.s, p2/M, z30.s, z1.s\n"
+      "smin z31.s, p2/M, z31.s, z1.s\n"
+      "smax z16.s, p2/M, z16.s, z0.s\n"
+      "smax z17.s, p2/M, z17.s, z0.s\n"
+      "smax z18.s, p2/M, z18.s, z0.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z0.s\n"
+      "smax z20.s, p2/M, z20.s, z0.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z0.s\n"
+      "smax z22.s, p2/M, z22.s, z0.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
       "st1b { z16.b }, p1, [x27]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z0.s\n"
+      "smax z24.s, p2/M, z24.s, z0.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z0.s\n"
+      "smax z26.s, p2/M, z26.s, z0.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
       "st1b { z20.b }, p1, [x23]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "smax z28.s, p2/M, z28.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "smax z29.s, p2/M, z29.s, z5.s\n"
-      "smax z30.s, p2/M, z30.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z0.s\n"
+      "smax z28.s, p2/M, z28.s, z0.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "smax z29.s, p2/M, z29.s, z0.s\n"
+      "smax z30.s, p2/M, z30.s, z0.s\n"
       "uzp1 z28.h, z28.h, z29.h\n"
       "st1b { z24.b }, p1, [x22]\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
+      "smax z31.s, p2/M, z31.s, z0.s\n"
+      "uzp1 z16.h, z30.h, z31.h\n"
+      "uzp1 z28.b, z28.b, z16.b\n"
       "st1b { z28.b }, p1, [x21]\n"
       "addvl x27, x27, #1\n"
       "56:"  // Height 4: Writeback done
@@ -1491,7 +1491,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "58:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1499,4 +1498,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
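The hunks above touch the requantisation tail of this kernel. As a reading aid only, here is a minimal scalar sketch in C++ of what the sqrdmulh / srshl / add / smin / smax / uzp1 sequence computes per output lane; the names are illustrative and this is not the library's API. It assumes the row-sum * (-b_offset) and column-bias additions earlier in the asm have already been folded into acc, and it omits the and/asr/sqadd shift-correction path guarded by flags bit 5 as well as SQRDMULH's saturating corner case.

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar model of the per-lane requantisation done above:
    //   acc -> SQRDMULH(acc, per_layer_mul) -> rounding right shift
    //       -> + c_offset -> clamp to [minval, maxval] -> narrow to int8.
    static inline int8_t requantize_lane(int32_t acc, int32_t per_layer_mul,
                                         int32_t right_shift, // positive count; the asm passes it
                                                              // to SRSHL as a negative shift
                                         int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // SQRDMULH: rounding doubling multiply, keeping the high 32 bits.
        // (The saturating INT32_MIN * INT32_MIN corner case is ignored here.)
        const int64_t prod = static_cast<int64_t>(acc) * per_layer_mul;
        int32_t v = static_cast<int32_t>((2 * prod + (1LL << 31)) >> 32);

        // SRSHL by a negative amount: rounding arithmetic shift right.
        if (right_shift > 0)
        {
            v = static_cast<int32_t>((static_cast<int64_t>(v) + (1LL << (right_shift - 1))) >> right_shift);
        }

        // Add the output offset, then clamp as smin/smax do with maxval/minval.
        v += c_offset;
        v = std::min(std::max(v, minval), maxval);

        // uzp1 .h / .b narrows the clamped 32-bit lanes to bytes for st1b.
        return static_cast<int8_t>(v);
    }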
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
index 9681505..ae922e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, int8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
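The generic.cpp diff below reworks register allocation around an SMMLA-based inner loop. For reference, a minimal C++ sketch (illustrative names, not the library's API) of what a single smmla accumulates per 128-bit segment is given here; in the asm, trn1/trn2 pair two input rows into the 2x8 blocks and the trailing uzp1 ... .d instructions de-interleave the 2x2 accumulator tiles back into per-row results.

    #include <cstdint>

    // Reference model of SMMLA for one 128-bit segment: a holds a 2x8 block of
    // signed 8-bit values (row-major), b holds the corresponding 8x2 block stored
    // column-major (so b[j][k] is column j, element k), and the 2x2 int32 product
    // is accumulated into acc.
    static void smmla_segment_ref(int32_t acc[2][2], const int8_t a[2][8], const int8_t b[2][8])
    {
        for (int i = 0; i < 2; ++i)
        {
            for (int j = 0; j < 2; ++j)
            {
                for (int k = 0; k < 8; ++k)
                {
                    acc[i][j] += static_cast<int32_t>(a[i][k]) * static_cast<int32_t>(b[j][k]);
                }
            }
        }
    }

Each such segment performs 2x2x8 = 32 multiply-accumulates, which is why these kernels interleave two rows of the left-hand operand per z register.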
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
index 626a06b..e062836 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
@@ -108,11 +108,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -125,41 +125,41 @@
       "7:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "trn1 z0.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45189810  // smmla z16.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "trn2 z1.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45199814  // smmla z20.s, z0.b, z25.b\n"
+      ".inst 0x45189811  // smmla z17.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x451a9815  // smmla z21.s, z0.b, z26.b\n"
+      ".inst 0x45199812  // smmla z18.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
+      ".inst 0x45189816  // smmla z22.s, z0.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x451a9813  // smmla z19.s, z0.b, z26.b\n"
+      ".inst 0x45199817  // smmla z23.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x45189830  // smmla z16.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x451a9834  // smmla z20.s, z1.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      ".inst 0x45199831  // smmla z17.s, z1.b, z25.b\n"
+      ".inst 0x45189835  // smmla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x451b9832  // smmla z18.s, z1.b, z27.b\n"
+      ".inst 0x451a9836  // smmla z22.s, z1.b, z26.b\n"
+      ".inst 0x45199833  // smmla z19.s, z1.b, z25.b\n"
+      ".inst 0x45189837  // smmla z23.s, z1.b, z24.b\n"
       "add x24, x24, #0x10\n"
       "tbnz %x[flags], #31, 8f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -171,43 +171,43 @@
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "trn1 z0.d, z1.d, z27.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45189810  // smmla z16.s, z0.b, z24.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "trn2 z1.d, z1.d, z27.d\n"
+      ".inst 0x451a9814  // smmla z20.s, z0.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45199811  // smmla z17.s, z0.b, z25.b\n"
+      ".inst 0x45189815  // smmla z21.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x451b9812  // smmla z18.s, z0.b, z27.b\n"
+      ".inst 0x451a9816  // smmla z22.s, z0.b, z26.b\n"
+      ".inst 0x45199813  // smmla z19.s, z0.b, z25.b\n"
+      ".inst 0x45189817  // smmla z23.s, z0.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45189830  // smmla z16.s, z1.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x45189834  // smmla z20.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45199831  // smmla z17.s, z1.b, z25.b\n"
+      ".inst 0x45189835  // smmla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45199832  // smmla z18.s, z1.b, z25.b\n"
+      ".inst 0x45189836  // smmla z22.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45199833  // smmla z19.s, z1.b, z25.b\n"
+      ".inst 0x45189837  // smmla z23.s, z1.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 11f\n"
@@ -224,74 +224,74 @@
       "uzp1 z19.d, z19.d, z23.d\n"
       "mov z23.d, z16.d\n"
       "tbnz %x[flags], #31, 12f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z1.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
-      "neg z1.s, p2/M, z1.s\n"
+      "neg z16.s, p2/M, z16.s\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z1.s\n"
+      "mul z11.s, p2/M, z11.s, z16.s\n"
       "12:"  // Height 1: skip row sum fixup
       "add z23.s, z23.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x10]\n"
+      "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      "add z23.s, z23.s, z22.s\n"
+      "add z17.s, z17.s, z21.s\n"
+      "add z18.s, z18.s, z20.s\n"
+      "add z19.s, z19.s, z16.s\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      ".inst 0x04b076f7  // sqrdmulh z23.s, z23.s, z16.s\n"
+      ".inst 0x04b07631  // sqrdmulh z17.s, z17.s, z16.s\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04b07652  // sqrdmulh z18.s, z18.s, z16.s\n"
+      ".inst 0x04b07673  // sqrdmulh z19.s, z19.s, z16.s\n"
       "tbz %x[flags], #5, 13f\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z22.d, z23.d, z0.d\n"
+      "and z21.d, z17.d, z0.d\n"
+      "and z20.d, z18.d, z0.d\n"
+      "and z16.d, z19.d, z0.d\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z22.s\n"
+      "sqadd z17.s, z17.s, z21.s\n"
+      "sqadd z18.s, z18.s, z20.s\n"
+      "sqadd z19.s, z19.s, z16.s\n"
       "13:"  // Height 1: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z23.s, z23.s, z16.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z16.s\n"
+      "add z18.s, z18.s, z16.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z16.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "smin z23.s, p2/M, z23.s, z20.s\n"
+      "smin z17.s, p2/M, z17.s, z20.s\n"
+      "smin z18.s, p2/M, z18.s, z20.s\n"
+      "smin z19.s, p2/M, z19.s, z20.s\n"
+      "smax z23.s, p2/M, z23.s, z16.s\n"
+      "smax z17.s, p2/M, z17.s, z16.s\n"
+      "smax z18.s, p2/M, z18.s, z16.s\n"
       "uzp1 z23.h, z23.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "uzp1 z17.h, z18.h, z19.h\n"
-      "uzp1 z23.b, z23.b, z17.b\n"
+      "smax z19.s, p2/M, z19.s, z16.s\n"
+      "uzp1 z16.h, z18.h, z19.h\n"
+      "uzp1 z23.b, z23.b, z16.b\n"
       "st1b { z23.b }, p1, [x27]\n"
       "addvl x27, x27, #1\n"
       "14:"  // Height 1: Writeback done
@@ -324,12 +324,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -337,49 +337,49 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "20:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "ble 23f\n"
       "21:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "ld1rqb { z2.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1rqb { z26.b }, p0/Z, [x23]\n"
+      "trn1 z0.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45189810  // smmla z16.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "trn2 z1.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45199814  // smmla z20.s, z0.b, z25.b\n"
+      ".inst 0x45189811  // smmla z17.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x451a9815  // smmla z21.s, z0.b, z26.b\n"
+      ".inst 0x45199812  // smmla z18.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
+      ".inst 0x45189816  // smmla z22.s, z0.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x451a9813  // smmla z19.s, z0.b, z26.b\n"
+      ".inst 0x45199817  // smmla z23.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x45189830  // smmla z16.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x451a9834  // smmla z20.s, z1.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      ".inst 0x45199831  // smmla z17.s, z1.b, z25.b\n"
+      ".inst 0x45189835  // smmla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x451b9832  // smmla z18.s, z1.b, z27.b\n"
+      ".inst 0x451a9836  // smmla z22.s, z1.b, z26.b\n"
+      ".inst 0x45199833  // smmla z19.s, z1.b, z25.b\n"
+      ".inst 0x45189837  // smmla z23.s, z1.b, z24.b\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       "tbnz %x[flags], #31, 22f\n"
@@ -392,44 +392,44 @@
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "ld1rqb { z2.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn1 z0.d, z1.d, z27.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45189810  // smmla z16.s, z0.b, z24.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "trn2 z1.d, z1.d, z27.d\n"
+      ".inst 0x451a9814  // smmla z20.s, z0.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45199811  // smmla z17.s, z0.b, z25.b\n"
+      ".inst 0x45189815  // smmla z21.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x451b9812  // smmla z18.s, z0.b, z27.b\n"
+      ".inst 0x451a9816  // smmla z22.s, z0.b, z26.b\n"
+      ".inst 0x45199813  // smmla z19.s, z0.b, z25.b\n"
+      ".inst 0x45189817  // smmla z23.s, z0.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "ble 24f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45189830  // smmla z16.s, z1.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x45189834  // smmla z20.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45199831  // smmla z17.s, z1.b, z25.b\n"
+      ".inst 0x45189835  // smmla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45199832  // smmla z18.s, z1.b, z25.b\n"
+      ".inst 0x45189836  // smmla z22.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45199833  // smmla z19.s, z1.b, z25.b\n"
+      ".inst 0x45189837  // smmla z23.s, z1.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 25f\n"
@@ -440,133 +440,133 @@
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 18b\n"
-      "uzp1 z7.d, z16.d, z20.d\n"
+      "uzp1 z24.d, z16.d, z20.d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "add x22, x27, x20\n"
+      "add x23, x27, x20\n"
       "uzp1 z20.d, z17.d, z21.d\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
       "uzp2 z18.d, z18.d, z22.d\n"
       "uzp1 z22.d, z19.d, z23.d\n"
       "uzp2 z19.d, z19.d, z23.d\n"
-      "mov z23.d, z7.d\n"
+      "mov z23.d, z24.d\n"
       "tbnz %x[flags], #31, 26f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z2.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
-      "neg z2.s, p2/M, z2.s\n"
+      "neg z24.s, p2/M, z24.s\n"
       "mov z12.s, z11.s[3]\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z2.s\n"
-      "mul z12.s, p2/M, z12.s, z2.s\n"
+      "mul z11.s, p2/M, z11.s, z24.s\n"
+      "mul z12.s, p2/M, z12.s, z24.s\n"
       "26:"  // Height 2: skip row sum fixup
       "add z23.s, z23.s, z11.s\n"
       "add z20.s, z20.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x10]\n"
+      "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z21.s, z21.s, z11.s\n"
       "add z22.s, z22.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z16.s, z16.s, z12.s\n"
       "add z17.s, z17.s, z12.s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z18.s, z18.s, z12.s\n"
       "add z19.s, z19.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z23.s, z23.s, z28.s\n"
+      "add z20.s, z20.s, z27.s\n"
       "addvl x10, x10, #4\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      "add z21.s, z21.s, z26.s\n"
+      "add z22.s, z22.s, z25.s\n"
+      "add z16.s, z16.s, z28.s\n"
+      "add z17.s, z17.s, z27.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z18.s, z18.s, z26.s\n"
+      "add z19.s, z19.s, z25.s\n"
+      ".inst 0x04b876f7  // sqrdmulh z23.s, z23.s, z24.s\n"
+      ".inst 0x04b87694  // sqrdmulh z20.s, z20.s, z24.s\n"
+      ".inst 0x04b876b5  // sqrdmulh z21.s, z21.s, z24.s\n"
+      ".inst 0x04b876d6  // sqrdmulh z22.s, z22.s, z24.s\n"
+      ".inst 0x04b87610  // sqrdmulh z16.s, z16.s, z24.s\n"
+      ".inst 0x04b87631  // sqrdmulh z17.s, z17.s, z24.s\n"
+      ".inst 0x04b87652  // sqrdmulh z18.s, z18.s, z24.s\n"
+      ".inst 0x04b87673  // sqrdmulh z19.s, z19.s, z24.s\n"
       "tbz %x[flags], #5, 27f\n"
-      "and z4.d, z23.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "and z5.d, z20.d, z0.d\n"
-      "and z6.d, z21.d, z0.d\n"
-      "and z7.d, z22.d, z0.d\n"
-      "and z8.d, z16.d, z0.d\n"
-      "and z9.d, z17.d, z0.d\n"
-      "and z10.d, z18.d, z0.d\n"
-      "and z4.d, z19.d, z0.d\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "sqadd z16.s, z16.s, z8.s\n"
-      "sqadd z17.s, z17.s, z9.s\n"
-      "sqadd z18.s, z18.s, z10.s\n"
-      "sqadd z19.s, z19.s, z4.s\n"
+      "and z24.d, z23.d, z0.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z24.s\n"
+      "and z30.d, z20.d, z0.d\n"
+      "and z29.d, z21.d, z0.d\n"
+      "and z28.d, z22.d, z0.d\n"
+      "and z27.d, z16.d, z0.d\n"
+      "and z26.d, z17.d, z0.d\n"
+      "and z25.d, z18.d, z0.d\n"
+      "and z24.d, z19.d, z0.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z30.s\n"
+      "sqadd z21.s, z21.s, z29.s\n"
+      "sqadd z22.s, z22.s, z28.s\n"
+      "sqadd z16.s, z16.s, z27.s\n"
+      "sqadd z17.s, z17.s, z26.s\n"
+      "sqadd z18.s, z18.s, z25.s\n"
+      "sqadd z19.s, z19.s, z24.s\n"
       "27:"  // Height 2: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z23.s, z23.s, z24.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z24.s\n"
+      "add z21.s, z21.s, z24.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z24.s\n"
+      "add z16.s, z16.s, z24.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z24.s\n"
+      "add z18.s, z18.s, z24.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z24.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "smin z23.s, p2/M, z23.s, z25.s\n"
+      "smin z20.s, p2/M, z20.s, z25.s\n"
+      "smin z21.s, p2/M, z21.s, z25.s\n"
+      "smin z22.s, p2/M, z22.s, z25.s\n"
+      "smin z16.s, p2/M, z16.s, z25.s\n"
+      "smin z17.s, p2/M, z17.s, z25.s\n"
+      "smin z18.s, p2/M, z18.s, z25.s\n"
+      "smin z19.s, p2/M, z19.s, z25.s\n"
+      "smax z23.s, p2/M, z23.s, z24.s\n"
+      "smax z20.s, p2/M, z20.s, z24.s\n"
+      "smax z21.s, p2/M, z21.s, z24.s\n"
       "uzp1 z23.h, z23.h, z20.h\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z22.s, p2/M, z22.s, z24.s\n"
+      "smax z16.s, p2/M, z16.s, z24.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z23.b, z23.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z24.s\n"
+      "smax z18.s, p2/M, z18.s, z24.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
       "st1b { z23.b }, p1, [x27]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z24.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "st1b { z16.b }, p1, [x22]\n"
+      "st1b { z16.b }, p1, [x23]\n"
       "addvl x27, x27, #1\n"
       "28:"  // Height 2: Writeback done
       "decw x9, ALL, MUL #4\n"
@@ -607,13 +607,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -622,8 +622,8 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "34:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "ble 37f\n"
@@ -634,60 +634,60 @@
       "ld1rqb { z3.b }, p0/Z, [x22]\n"
       "trn1 z0.d, z1.d, z2.d\n"
       "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x45059858  // smmla z24.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x4506985c  // smmla z28.s, z2.b, z6.b\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "trn1 z2.d, z3.d, z5.d\n"
+      "trn2 z3.d, z3.d, z5.d\n"
+      ".inst 0x45049810  // smmla z16.s, z0.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x45049858  // smmla z24.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45059814  // smmla z20.s, z0.b, z5.b\n"
+      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x4505985c  // smmla z28.s, z2.b, z5.b\n"
+      ".inst 0x45049811  // smmla z17.s, z0.b, z4.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
       "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x45079859  // smmla z25.s, z2.b, z7.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      ".inst 0x4508985d  // smmla z29.s, z2.b, z8.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      ".inst 0x4509985a  // smmla z26.s, z2.b, z9.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      ".inst 0x450a985e  // smmla z30.s, z2.b, z10.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      ".inst 0x45049859  // smmla z25.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x45099815  // smmla z21.s, z0.b, z9.b\n"
+      ".inst 0x4509985d  // smmla z29.s, z2.b, z9.b\n"
+      "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x45089812  // smmla z18.s, z0.b, z8.b\n"
+      ".inst 0x4508985a  // smmla z26.s, z2.b, z8.b\n"
+      "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x45079816  // smmla z22.s, z0.b, z7.b\n"
+      ".inst 0x4507985e  // smmla z30.s, z2.b, z7.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x4504985b  // smmla z27.s, z2.b, z4.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      ".inst 0x45069813  // smmla z19.s, z0.b, z6.b\n"
+      ".inst 0x4506985b  // smmla z27.s, z2.b, z6.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
       ".inst 0x4505985f  // smmla z31.s, z2.b, z5.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      ".inst 0x45069878  // smmla z24.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      ".inst 0x4507987c  // smmla z28.s, z3.b, z7.b\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45089879  // smmla z25.s, z3.b, z8.b\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      ".inst 0x4509987d  // smmla z29.s, z3.b, z9.b\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x450a987a  // smmla z26.s, z3.b, z10.b\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      ".inst 0x4504987e  // smmla z30.s, z3.b, z4.b\n"
+      ".inst 0x45049830  // smmla z16.s, z1.b, z4.b\n"
+      ".inst 0x45049878  // smmla z24.s, z3.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x450a9834  // smmla z20.s, z1.b, z10.b\n"
+      ".inst 0x450a987c  // smmla z28.s, z3.b, z10.b\n"
+      ".inst 0x45099831  // smmla z17.s, z1.b, z9.b\n"
+      ".inst 0x45099879  // smmla z25.s, z3.b, z9.b\n"
+      ".inst 0x45089835  // smmla z21.s, z1.b, z8.b\n"
+      ".inst 0x4508987d  // smmla z29.s, z3.b, z8.b\n"
+      ".inst 0x45079832  // smmla z18.s, z1.b, z7.b\n"
+      ".inst 0x4507987a  // smmla z26.s, z3.b, z7.b\n"
+      ".inst 0x45069836  // smmla z22.s, z1.b, z6.b\n"
+      ".inst 0x4506987e  // smmla z30.s, z3.b, z6.b\n"
       ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
       ".inst 0x4505987b  // smmla z27.s, z3.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
-      ".inst 0x4506987f  // smmla z31.s, z3.b, z6.b\n"
+      ".inst 0x45049837  // smmla z23.s, z1.b, z4.b\n"
+      ".inst 0x4504987f  // smmla z31.s, z3.b, z4.b\n"
       "tbnz %x[flags], #31, 36f\n"
       "sdot z11.s, z0.b, z15.b\n"
       "sdot z13.s, z2.b, z15.b\n"
@@ -708,56 +708,56 @@
       "trn1 z2.d, z3.d, z4.d\n"
       "trn2 z3.d, z3.d, z4.d\n"
       ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
       ".inst 0x45059858  // smmla z24.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
       "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      ".inst 0x4506985c  // smmla z28.s, z2.b, z6.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      ".inst 0x45079859  // smmla z25.s, z2.b, z7.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45049814  // smmla z20.s, z0.b, z4.b\n"
+      ".inst 0x4504985c  // smmla z28.s, z2.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45099811  // smmla z17.s, z0.b, z9.b\n"
+      ".inst 0x45099859  // smmla z25.s, z2.b, z9.b\n"
       ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
       ".inst 0x4508985d  // smmla z29.s, z2.b, z8.b\n"
       "addvl x28, x28, #8\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      ".inst 0x4509985a  // smmla z26.s, z2.b, z9.b\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      ".inst 0x450a985e  // smmla z30.s, z2.b, z10.b\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x4504985b  // smmla z27.s, z2.b, z4.b\n"
-      ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
-      ".inst 0x4505985f  // smmla z31.s, z2.b, z5.b\n"
+      ".inst 0x45079812  // smmla z18.s, z0.b, z7.b\n"
+      ".inst 0x4507985a  // smmla z26.s, z2.b, z7.b\n"
+      ".inst 0x45069816  // smmla z22.s, z0.b, z6.b\n"
+      ".inst 0x4506985e  // smmla z30.s, z2.b, z6.b\n"
+      ".inst 0x45059813  // smmla z19.s, z0.b, z5.b\n"
+      ".inst 0x4505985b  // smmla z27.s, z2.b, z5.b\n"
+      ".inst 0x45049817  // smmla z23.s, z0.b, z4.b\n"
+      ".inst 0x4504985f  // smmla z31.s, z2.b, z4.b\n"
       "ble 38f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      ".inst 0x45069878  // smmla z24.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      ".inst 0x4507987c  // smmla z28.s, z3.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45089879  // smmla z25.s, z3.b, z8.b\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      ".inst 0x45049830  // smmla z16.s, z1.b, z4.b\n"
+      ".inst 0x45049878  // smmla z24.s, z3.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45059834  // smmla z20.s, z1.b, z5.b\n"
+      ".inst 0x4505987c  // smmla z28.s, z3.b, z5.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45049831  // smmla z17.s, z1.b, z4.b\n"
+      ".inst 0x45049879  // smmla z25.s, z3.b, z4.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      ".inst 0x4509987d  // smmla z29.s, z3.b, z9.b\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x450a987a  // smmla z26.s, z3.b, z10.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45089835  // smmla z21.s, z1.b, z8.b\n"
+      ".inst 0x4508987d  // smmla z29.s, z3.b, z8.b\n"
+      ".inst 0x45079832  // smmla z18.s, z1.b, z7.b\n"
+      ".inst 0x4507987a  // smmla z26.s, z3.b, z7.b\n"
       "addvl x28, x28, #8\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      ".inst 0x4504987e  // smmla z30.s, z3.b, z4.b\n"
+      ".inst 0x45069836  // smmla z22.s, z1.b, z6.b\n"
+      ".inst 0x4506987e  // smmla z30.s, z3.b, z6.b\n"
       ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
       ".inst 0x4505987b  // smmla z27.s, z3.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
-      ".inst 0x4506987f  // smmla z31.s, z3.b, z6.b\n"
+      ".inst 0x45049837  // smmla z23.s, z1.b, z4.b\n"
+      ".inst 0x4504987f  // smmla z31.s, z3.b, z4.b\n"
       "38:"  // Height 3: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 39f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -770,12 +770,12 @@
       "cmp x26, x20\n"
       "bne 32b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 z7.d, z16.d, z20.d\n"
-      "add x22, x27, x20\n"
+      "uzp1 z0.d, z16.d, z20.d\n"
+      "add x23, x27, x20\n"
       "uzp2 z16.d, z16.d, z20.d\n"
       "uzp1 z20.d, z17.d, z21.d\n"
       "uzp2 z17.d, z17.d, z21.d\n"
-      "add x21, x22, x20\n"
+      "add x22, x23, x20\n"
       "uzp1 z21.d, z18.d, z22.d\n"
       "uzp2 z18.d, z18.d, z22.d\n"
       "uzp1 z22.d, z19.d, z23.d\n"
@@ -784,170 +784,170 @@
       "uzp1 z25.d, z25.d, z29.d\n"
       "uzp1 z26.d, z26.d, z30.d\n"
       "uzp1 z27.d, z27.d, z31.d\n"
-      "mov z31.d, z7.d\n"
+      "mov z31.d, z0.d\n"
       "tbnz %x[flags], #31, 40f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z3.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
       ".inst 0x4491a9ad  // addp z13.s, p2/m, z13.s, z13.s\n"
-      "neg z3.s, p2/M, z3.s\n"
+      "neg z23.s, p2/M, z23.s\n"
       "mov z12.s, z11.s[3]\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z3.s\n"
+      "mul z11.s, p2/M, z11.s, z23.s\n"
       "mov z13.s, z13.s[0]\n"
-      "mul z12.s, p2/M, z12.s, z3.s\n"
-      "mul z13.s, p2/M, z13.s, z3.s\n"
+      "mul z12.s, p2/M, z12.s, z23.s\n"
+      "mul z13.s, p2/M, z13.s, z23.s\n"
       "40:"  // Height 3: skip row sum fixup
       "add z31.s, z31.s, z11.s\n"
       "add z20.s, z20.s, z11.s\n"
       "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z21.s, z21.s, z11.s\n"
       "add z22.s, z22.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z16.s, z16.s, z12.s\n"
       "add z17.s, z17.s, z12.s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z18.s, z18.s, z12.s\n"
       "add z19.s, z19.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z24.s, z24.s, z13.s\n"
       "add z25.s, z25.s, z13.s\n"
       "addvl x10, x10, #4\n"
       "add z26.s, z26.s, z13.s\n"
       "add z27.s, z27.s, z13.s\n"
       "add z31.s, z31.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
+      "add z20.s, z20.s, z30.s\n"
+      "add z21.s, z21.s, z29.s\n"
+      "add z22.s, z22.s, z28.s\n"
       "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "add z17.s, z17.s, z30.s\n"
+      "add z18.s, z18.s, z29.s\n"
+      "add z19.s, z19.s, z28.s\n"
       "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      "add z25.s, z25.s, z30.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z26.s, z26.s, z29.s\n"
+      "add z27.s, z27.s, z28.s\n"
+      ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+      ".inst 0x04b77694  // sqrdmulh z20.s, z20.s, z23.s\n"
+      ".inst 0x04b776b5  // sqrdmulh z21.s, z21.s, z23.s\n"
+      ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+      ".inst 0x04b77610  // sqrdmulh z16.s, z16.s, z23.s\n"
+      ".inst 0x04b77631  // sqrdmulh z17.s, z17.s, z23.s\n"
+      ".inst 0x04b77652  // sqrdmulh z18.s, z18.s, z23.s\n"
+      ".inst 0x04b77673  // sqrdmulh z19.s, z19.s, z23.s\n"
+      ".inst 0x04b77718  // sqrdmulh z24.s, z24.s, z23.s\n"
+      ".inst 0x04b77739  // sqrdmulh z25.s, z25.s, z23.s\n"
+      ".inst 0x04b7775a  // sqrdmulh z26.s, z26.s, z23.s\n"
+      ".inst 0x04b7777b  // sqrdmulh z27.s, z27.s, z23.s\n"
       "tbz %x[flags], #5, 41f\n"
-      "and z4.d, z31.d, z0.d\n"
-      "and z5.d, z20.d, z0.d\n"
-      "and z6.d, z21.d, z0.d\n"
-      "and z7.d, z22.d, z0.d\n"
-      "and z8.d, z16.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z31.s, z31.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "sqadd z16.s, z16.s, z8.s\n"
-      "and z9.d, z17.d, z0.d\n"
-      "and z10.d, z18.d, z0.d\n"
-      "and z4.d, z19.d, z0.d\n"
-      "and z5.d, z24.d, z0.d\n"
-      "and z6.d, z25.d, z0.d\n"
-      "and z7.d, z26.d, z0.d\n"
-      "and z8.d, z27.d, z0.d\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z17.s, z17.s, z9.s\n"
-      "sqadd z18.s, z18.s, z10.s\n"
-      "sqadd z19.s, z19.s, z4.s\n"
-      "sqadd z24.s, z24.s, z5.s\n"
-      "sqadd z25.s, z25.s, z6.s\n"
-      "sqadd z26.s, z26.s, z7.s\n"
-      "sqadd z27.s, z27.s, z8.s\n"
+      "and z1.d, z31.d, z0.d\n"
+      "and z30.d, z20.d, z0.d\n"
+      "and z29.d, z21.d, z0.d\n"
+      "and z28.d, z22.d, z0.d\n"
+      "and z23.d, z16.d, z0.d\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z31.s, z31.s, z1.s\n"
+      "sqadd z20.s, z20.s, z30.s\n"
+      "sqadd z21.s, z21.s, z29.s\n"
+      "sqadd z22.s, z22.s, z28.s\n"
+      "sqadd z16.s, z16.s, z23.s\n"
+      "and z3.d, z17.d, z0.d\n"
+      "and z2.d, z18.d, z0.d\n"
+      "and z1.d, z19.d, z0.d\n"
+      "and z30.d, z24.d, z0.d\n"
+      "and z29.d, z25.d, z0.d\n"
+      "and z28.d, z26.d, z0.d\n"
+      "and z23.d, z27.d, z0.d\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z3.s\n"
+      "sqadd z18.s, z18.s, z2.s\n"
+      "sqadd z19.s, z19.s, z1.s\n"
+      "sqadd z24.s, z24.s, z30.s\n"
+      "sqadd z25.s, z25.s, z29.s\n"
+      "sqadd z26.s, z26.s, z28.s\n"
+      "sqadd z27.s, z27.s, z23.s\n"
       "41:"  // Height 3: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
       ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
-      "add z31.s, z31.s, z4.s\n"
+      "add z31.s, z31.s, z23.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z23.s\n"
+      "add z21.s, z21.s, z23.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z23.s\n"
+      "add z16.s, z16.s, z23.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z23.s\n"
+      "add z18.s, z18.s, z23.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z19.s, z19.s, z23.s\n"
+      "add z24.s, z24.s, z23.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z23.s\n"
+      "add z26.s, z26.s, z23.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z23.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
+      "smin z31.s, p2/M, z31.s, z28.s\n"
+      "smin z20.s, p2/M, z20.s, z28.s\n"
+      "smin z21.s, p2/M, z21.s, z28.s\n"
+      "smin z22.s, p2/M, z22.s, z28.s\n"
+      "smin z16.s, p2/M, z16.s, z28.s\n"
+      "smin z17.s, p2/M, z17.s, z28.s\n"
+      "smin z18.s, p2/M, z18.s, z28.s\n"
+      "smin z19.s, p2/M, z19.s, z28.s\n"
+      "smin z24.s, p2/M, z24.s, z28.s\n"
+      "smin z25.s, p2/M, z25.s, z28.s\n"
+      "smin z26.s, p2/M, z26.s, z28.s\n"
+      "smin z27.s, p2/M, z27.s, z28.s\n"
+      "smax z31.s, p2/M, z31.s, z23.s\n"
+      "smax z20.s, p2/M, z20.s, z23.s\n"
+      "smax z21.s, p2/M, z21.s, z23.s\n"
       "uzp1 z31.h, z31.h, z20.h\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z22.s, p2/M, z22.s, z23.s\n"
+      "smax z16.s, p2/M, z16.s, z23.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z31.b, z31.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z23.s\n"
+      "smax z18.s, p2/M, z18.s, z23.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
       "st1b { z31.b }, p1, [x27]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z23.s\n"
+      "smax z24.s, p2/M, z24.s, z23.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z25.s, p2/M, z25.s, z23.s\n"
+      "smax z26.s, p2/M, z26.s, z23.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z16.b }, p1, [x22]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x21]\n"
+      "st1b { z16.b }, p1, [x23]\n"
+      "smax z27.s, p2/M, z27.s, z23.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x22]\n"
       "addvl x27, x27, #1\n"
       "42:"  // Height 3: Writeback done
       "decw x9, ALL, MUL #4\n"
@@ -992,14 +992,14 @@
       "46:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 47f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 48f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1009,9 +1009,9 @@
       "b 48f\n"
       "47:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "48:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "ble 51f\n"
@@ -1021,63 +1021,63 @@
       "ld1rqb { z2.b }, p0/Z, [x23]\n"
       "trn1 z0.d, z1.d, z2.d\n"
       "ld1rqb { z3.b }, p0/Z, [x22]\n"
-      "ld1rqb { z4.b }, p0/Z, [x21]\n"
+      "ld1rqb { z5.b }, p0/Z, [x21]\n"
       "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      ".inst 0x45059858  // smmla z24.s, z2.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
-      ".inst 0x4506985c  // smmla z28.s, z2.b, z6.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      ".inst 0x45079859  // smmla z25.s, z2.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      ".inst 0x4508985d  // smmla z29.s, z2.b, z8.b\n"
+      "trn1 z2.d, z3.d, z5.d\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "trn2 z3.d, z3.d, z5.d\n"
+      ".inst 0x45049810  // smmla z16.s, z0.b, z4.b\n"
+      ".inst 0x45049858  // smmla z24.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      ".inst 0x45049814  // smmla z20.s, z0.b, z4.b\n"
+      ".inst 0x4504985c  // smmla z28.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x45059811  // smmla z17.s, z0.b, z5.b\n"
+      ".inst 0x45059859  // smmla z25.s, z2.b, z5.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x45049815  // smmla z21.s, z0.b, z4.b\n"
+      ".inst 0x4504985d  // smmla z29.s, z2.b, z4.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      ".inst 0x4509985a  // smmla z26.s, z2.b, z9.b\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      ".inst 0x450a985e  // smmla z30.s, z2.b, z10.b\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      ".inst 0x4504985b  // smmla z27.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x45089812  // smmla z18.s, z0.b, z8.b\n"
+      ".inst 0x4508985a  // smmla z26.s, z2.b, z8.b\n"
+      ".inst 0x45079816  // smmla z22.s, z0.b, z7.b\n"
+      "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      ".inst 0x4507985e  // smmla z30.s, z2.b, z7.b\n"
+      ".inst 0x45069813  // smmla z19.s, z0.b, z6.b\n"
+      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      ".inst 0x4506985b  // smmla z27.s, z2.b, z6.b\n"
       ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x4505985f  // smmla z31.s, z2.b, z5.b\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
+      ".inst 0x45049830  // smmla z16.s, z1.b, z4.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x45069878  // smmla z24.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
+      ".inst 0x45049878  // smmla z24.s, z3.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x450a9834  // smmla z20.s, z1.b, z10.b\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4507987c  // smmla z28.s, z3.b, z7.b\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
+      ".inst 0x450a987c  // smmla z28.s, z3.b, z10.b\n"
+      ".inst 0x45099831  // smmla z17.s, z1.b, z9.b\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x45089879  // smmla z25.s, z3.b, z8.b\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      ".inst 0x4509987d  // smmla z29.s, z3.b, z9.b\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x450a987a  // smmla z26.s, z3.b, z10.b\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      ".inst 0x4504987e  // smmla z30.s, z3.b, z4.b\n"
+      ".inst 0x45099879  // smmla z25.s, z3.b, z9.b\n"
+      ".inst 0x45089835  // smmla z21.s, z1.b, z8.b\n"
+      ".inst 0x4508987d  // smmla z29.s, z3.b, z8.b\n"
+      ".inst 0x45079832  // smmla z18.s, z1.b, z7.b\n"
+      ".inst 0x4507987a  // smmla z26.s, z3.b, z7.b\n"
+      ".inst 0x45069836  // smmla z22.s, z1.b, z6.b\n"
+      ".inst 0x4506987e  // smmla z30.s, z3.b, z6.b\n"
       ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
       ".inst 0x4505987b  // smmla z27.s, z3.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
-      ".inst 0x4506987f  // smmla z31.s, z3.b, z6.b\n"
+      ".inst 0x45049837  // smmla z23.s, z1.b, z4.b\n"
+      ".inst 0x4504987f  // smmla z31.s, z3.b, z4.b\n"
       "tbnz %x[flags], #31, 50f\n"
       "sdot z11.s, z0.b, z15.b\n"
       "sdot z13.s, z2.b, z15.b\n"
@@ -1093,62 +1093,62 @@
       "ld1rqb { z2.b }, p0/Z, [x23]\n"
       "trn1 z0.d, z1.d, z2.d\n"
       "ld1rqb { z3.b }, p0/Z, [x22]\n"
-      "ld1rqb { z4.b }, p0/Z, [x21]\n"
+      "ld1rqb { z5.b }, p0/Z, [x21]\n"
       "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45059810  // smmla z16.s, z0.b, z5.b\n"
-      ".inst 0x45059858  // smmla z24.s, z2.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "trn1 z2.d, z3.d, z5.d\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "trn2 z3.d, z3.d, z5.d\n"
+      ".inst 0x45049810  // smmla z16.s, z0.b, z4.b\n"
+      ".inst 0x45049858  // smmla z24.s, z2.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      ".inst 0x45069814  // smmla z20.s, z0.b, z6.b\n"
+      ".inst 0x45059814  // smmla z20.s, z0.b, z5.b\n"
       "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x4506985c  // smmla z28.s, z2.b, z6.b\n"
-      ".inst 0x45079811  // smmla z17.s, z0.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      ".inst 0x45079859  // smmla z25.s, z2.b, z7.b\n"
-      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x4508985d  // smmla z29.s, z2.b, z8.b\n"
-      ".inst 0x45099812  // smmla z18.s, z0.b, z9.b\n"
-      "addvl x28, x28, #8\n"
-      ".inst 0x4509985a  // smmla z26.s, z2.b, z9.b\n"
-      ".inst 0x450a9816  // smmla z22.s, z0.b, z10.b\n"
-      ".inst 0x450a985e  // smmla z30.s, z2.b, z10.b\n"
-      ".inst 0x45049813  // smmla z19.s, z0.b, z4.b\n"
-      ".inst 0x4504985b  // smmla z27.s, z2.b, z4.b\n"
-      ".inst 0x45059817  // smmla z23.s, z0.b, z5.b\n"
-      ".inst 0x4505985f  // smmla z31.s, z2.b, z5.b\n"
-      "ble 52f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      ".inst 0x45069878  // smmla z24.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45079834  // smmla z20.s, z1.b, z7.b\n"
-      ".inst 0x4507987c  // smmla z28.s, z3.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45089831  // smmla z17.s, z1.b, z8.b\n"
-      ".inst 0x45089879  // smmla z25.s, z3.b, z8.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x4505985c  // smmla z28.s, z2.b, z5.b\n"
+      ".inst 0x45049811  // smmla z17.s, z0.b, z4.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
       "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45099835  // smmla z21.s, z1.b, z9.b\n"
-      ".inst 0x4509987d  // smmla z29.s, z3.b, z9.b\n"
-      ".inst 0x450a9832  // smmla z18.s, z1.b, z10.b\n"
-      ".inst 0x450a987a  // smmla z26.s, z3.b, z10.b\n"
+      ".inst 0x45049859  // smmla z25.s, z2.b, z4.b\n"
+      ".inst 0x45089815  // smmla z21.s, z0.b, z8.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x4508985d  // smmla z29.s, z2.b, z8.b\n"
+      ".inst 0x45079812  // smmla z18.s, z0.b, z7.b\n"
       "addvl x28, x28, #8\n"
-      ".inst 0x45049836  // smmla z22.s, z1.b, z4.b\n"
-      ".inst 0x4504987e  // smmla z30.s, z3.b, z4.b\n"
+      ".inst 0x4507985a  // smmla z26.s, z2.b, z7.b\n"
+      ".inst 0x45069816  // smmla z22.s, z0.b, z6.b\n"
+      ".inst 0x4506985e  // smmla z30.s, z2.b, z6.b\n"
+      ".inst 0x45059813  // smmla z19.s, z0.b, z5.b\n"
+      ".inst 0x4505985b  // smmla z27.s, z2.b, z5.b\n"
+      ".inst 0x45049817  // smmla z23.s, z0.b, z4.b\n"
+      ".inst 0x4504985f  // smmla z31.s, z2.b, z4.b\n"
+      "ble 52f\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      ".inst 0x45049830  // smmla z16.s, z1.b, z4.b\n"
+      ".inst 0x45049878  // smmla z24.s, z3.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45059834  // smmla z20.s, z1.b, z5.b\n"
+      ".inst 0x4505987c  // smmla z28.s, z3.b, z5.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45049831  // smmla z17.s, z1.b, z4.b\n"
+      ".inst 0x45049879  // smmla z25.s, z3.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45089835  // smmla z21.s, z1.b, z8.b\n"
+      ".inst 0x4508987d  // smmla z29.s, z3.b, z8.b\n"
+      ".inst 0x45079832  // smmla z18.s, z1.b, z7.b\n"
+      ".inst 0x4507987a  // smmla z26.s, z3.b, z7.b\n"
+      "addvl x28, x28, #8\n"
+      ".inst 0x45069836  // smmla z22.s, z1.b, z6.b\n"
+      ".inst 0x4506987e  // smmla z30.s, z3.b, z6.b\n"
       ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
       ".inst 0x4505987b  // smmla z27.s, z3.b, z5.b\n"
-      ".inst 0x45069837  // smmla z23.s, z1.b, z6.b\n"
-      ".inst 0x4506987f  // smmla z31.s, z3.b, z6.b\n"
+      ".inst 0x45049837  // smmla z23.s, z1.b, z4.b\n"
+      ".inst 0x4504987f  // smmla z31.s, z3.b, z4.b\n"
       "52:"  // Height 4: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 53f\n"
       "sdot z11.s, z0.b, z15.b\n"
@@ -1161,12 +1161,12 @@
       "cmp x26, x20\n"
       "bne 46b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 z7.d, z16.d, z20.d\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "uzp1 z0.d, z16.d, z20.d\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "uzp2 z16.d, z16.d, z20.d\n"
       "uzp1 z20.d, z17.d, z21.d\n"
-      "add x20, x21, x20\n"
+      "add x21, x22, x20\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
       "uzp2 z18.d, z18.d, z22.d\n"
@@ -1180,38 +1180,38 @@
       "uzp2 z26.d, z26.d, z30.d\n"
       "uzp1 z30.d, z27.d, z31.d\n"
       "uzp2 z27.d, z27.d, z31.d\n"
-      "mov z31.d, z7.d\n"
+      "mov z31.d, z0.d\n"
       "tbnz %x[flags], #31, 54f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
       ".inst 0x4491a9ad  // addp z13.s, p2/m, z13.s, z13.s\n"
-      "neg z4.s, p2/M, z4.s\n"
+      "neg z0.s, p2/M, z0.s\n"
       "mov z12.s, z11.s[3]\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z4.s\n"
+      "mul z11.s, p2/M, z11.s, z0.s\n"
       "mov z14.s, z13.s[3]\n"
       "mov z13.s, z13.s[0]\n"
-      "mul z12.s, p2/M, z12.s, z4.s\n"
-      "mul z13.s, p2/M, z13.s, z4.s\n"
-      "mul z14.s, p2/M, z14.s, z4.s\n"
+      "mul z12.s, p2/M, z12.s, z0.s\n"
+      "mul z13.s, p2/M, z13.s, z0.s\n"
+      "mul z14.s, p2/M, z14.s, z0.s\n"
       "54:"  // Height 4: skip row sum fixup
       "add z31.s, z31.s, z11.s\n"
       "add z20.s, z20.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z4.s }, p2/Z, [x10]\n"
+      "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z21.s, z21.s, z11.s\n"
       "add z22.s, z22.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z16.s, z16.s, z12.s\n"
       "add z17.s, z17.s, z12.s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z18.s, z18.s, z12.s\n"
       "add z19.s, z19.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z23.s, z23.s, z13.s\n"
       "add z28.s, z28.s, z13.s\n"
       "addvl x10, x10, #4\n"
@@ -1221,175 +1221,175 @@
       "add z25.s, z25.s, z14.s\n"
       "add z26.s, z26.s, z14.s\n"
       "add z27.s, z27.s, z14.s\n"
-      "add z31.s, z31.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z28.s, z28.s, z1.s\n"
-      "add z29.s, z29.s, z2.s\n"
-      "add z30.s, z30.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
-      ".inst 0x04a477bd  // sqrdmulh z29.s, z29.s, z4.s\n"
-      ".inst 0x04a477de  // sqrdmulh z30.s, z30.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
-      "tbz %x[flags], #5, 55f\n"
-      "and z4.d, z31.d, z0.d\n"
-      "and z5.d, z20.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z31.s, z31.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "and z6.d, z21.d, z0.d\n"
-      "and z7.d, z22.d, z0.d\n"
-      "and z8.d, z16.d, z0.d\n"
-      "and z9.d, z17.d, z0.d\n"
-      "and z10.d, z18.d, z0.d\n"
-      "and z4.d, z19.d, z0.d\n"
-      "and z5.d, z23.d, z0.d\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "sqadd z16.s, z16.s, z8.s\n"
-      "sqadd z17.s, z17.s, z9.s\n"
-      "sqadd z18.s, z18.s, z10.s\n"
-      "sqadd z19.s, z19.s, z4.s\n"
-      "sqadd z23.s, z23.s, z5.s\n"
-      "and z6.d, z28.d, z0.d\n"
-      "and z7.d, z29.d, z0.d\n"
-      "and z8.d, z30.d, z0.d\n"
-      "and z9.d, z24.d, z0.d\n"
-      "and z10.d, z25.d, z0.d\n"
-      "and z4.d, z26.d, z0.d\n"
-      "and z5.d, z27.d, z0.d\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z28.s, z28.s, z6.s\n"
-      "sqadd z29.s, z29.s, z7.s\n"
-      "sqadd z30.s, z30.s, z8.s\n"
-      "sqadd z24.s, z24.s, z9.s\n"
-      "sqadd z25.s, z25.s, z10.s\n"
-      "sqadd z26.s, z26.s, z4.s\n"
-      "sqadd z27.s, z27.s, z5.s\n"
-      "55:"  // Height 4: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
       "add z31.s, z31.s, z4.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z3.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z0.s\n"
+      "add z18.s, z18.s, z3.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z28.s, z28.s, z0.s\n"
+      "add z29.s, z29.s, z3.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z26.s, z26.s, z3.s\n"
+      "add z27.s, z27.s, z2.s\n"
+      ".inst 0x04a177ff  // sqrdmulh z31.s, z31.s, z1.s\n"
+      ".inst 0x04a17694  // sqrdmulh z20.s, z20.s, z1.s\n"
+      ".inst 0x04a176b5  // sqrdmulh z21.s, z21.s, z1.s\n"
+      ".inst 0x04a176d6  // sqrdmulh z22.s, z22.s, z1.s\n"
+      ".inst 0x04a17610  // sqrdmulh z16.s, z16.s, z1.s\n"
+      ".inst 0x04a17631  // sqrdmulh z17.s, z17.s, z1.s\n"
+      ".inst 0x04a17652  // sqrdmulh z18.s, z18.s, z1.s\n"
+      ".inst 0x04a17673  // sqrdmulh z19.s, z19.s, z1.s\n"
+      ".inst 0x04a176f7  // sqrdmulh z23.s, z23.s, z1.s\n"
+      ".inst 0x04a1779c  // sqrdmulh z28.s, z28.s, z1.s\n"
+      ".inst 0x04a177bd  // sqrdmulh z29.s, z29.s, z1.s\n"
+      ".inst 0x04a177de  // sqrdmulh z30.s, z30.s, z1.s\n"
+      ".inst 0x04a17718  // sqrdmulh z24.s, z24.s, z1.s\n"
+      ".inst 0x04a17739  // sqrdmulh z25.s, z25.s, z1.s\n"
+      ".inst 0x04a1775a  // sqrdmulh z26.s, z26.s, z1.s\n"
+      ".inst 0x04a1777b  // sqrdmulh z27.s, z27.s, z1.s\n"
+      "tbz %x[flags], #5, 55f\n"
+      "and z2.d, z31.d, z0.d\n"
+      "and z1.d, z20.d, z0.d\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z31.s, z31.s, z2.s\n"
+      "sqadd z20.s, z20.s, z1.s\n"
+      "and z7.d, z21.d, z0.d\n"
+      "and z6.d, z22.d, z0.d\n"
+      "and z5.d, z16.d, z0.d\n"
+      "and z4.d, z17.d, z0.d\n"
+      "and z3.d, z18.d, z0.d\n"
+      "and z2.d, z19.d, z0.d\n"
+      "and z1.d, z23.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z7.s\n"
+      "sqadd z22.s, z22.s, z6.s\n"
+      "sqadd z16.s, z16.s, z5.s\n"
+      "sqadd z17.s, z17.s, z4.s\n"
+      "sqadd z18.s, z18.s, z3.s\n"
+      "sqadd z19.s, z19.s, z2.s\n"
+      "sqadd z23.s, z23.s, z1.s\n"
+      "and z7.d, z28.d, z0.d\n"
+      "and z6.d, z29.d, z0.d\n"
+      "and z5.d, z30.d, z0.d\n"
+      "and z4.d, z24.d, z0.d\n"
+      "and z3.d, z25.d, z0.d\n"
+      "and z2.d, z26.d, z0.d\n"
+      "and z1.d, z27.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z28.s, z28.s, z7.s\n"
+      "sqadd z29.s, z29.s, z6.s\n"
+      "sqadd z30.s, z30.s, z5.s\n"
+      "sqadd z24.s, z24.s, z4.s\n"
+      "sqadd z25.s, z25.s, z3.s\n"
+      "sqadd z26.s, z26.s, z2.s\n"
+      "sqadd z27.s, z27.s, z1.s\n"
+      "55:"  // Height 4: no shift correction
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
+      ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
+      "add z31.s, z31.s, z2.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z2.s\n"
+      "add z21.s, z21.s, z2.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z16.s, z16.s, z2.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z2.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z23.s, z23.s, z2.s\n"
       ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
       ".inst 0x4482881d  // srshl z29.s, p2/M, z29.s, z0.s\n"
-      "add z28.s, z28.s, z4.s\n"
-      "add z29.s, z29.s, z4.s\n"
+      "add z28.s, z28.s, z2.s\n"
+      "add z29.s, z29.s, z2.s\n"
       ".inst 0x4482881e  // srshl z30.s, p2/M, z30.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z30.s, z30.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z24.s, z24.s, z2.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z2.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z28.s, p2/M, z28.s, z6.s\n"
-      "smin z29.s, p2/M, z29.s, z6.s\n"
-      "smin z30.s, p2/M, z30.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z2.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "smin z31.s, p2/M, z31.s, z1.s\n"
+      "smin z20.s, p2/M, z20.s, z1.s\n"
+      "smin z21.s, p2/M, z21.s, z1.s\n"
+      "smin z22.s, p2/M, z22.s, z1.s\n"
+      "smin z16.s, p2/M, z16.s, z1.s\n"
+      "smin z17.s, p2/M, z17.s, z1.s\n"
+      "smin z18.s, p2/M, z18.s, z1.s\n"
+      "smin z19.s, p2/M, z19.s, z1.s\n"
+      "smin z23.s, p2/M, z23.s, z1.s\n"
+      "smin z28.s, p2/M, z28.s, z1.s\n"
+      "smin z29.s, p2/M, z29.s, z1.s\n"
+      "smin z30.s, p2/M, z30.s, z1.s\n"
+      "smin z24.s, p2/M, z24.s, z1.s\n"
+      "smin z25.s, p2/M, z25.s, z1.s\n"
+      "smin z26.s, p2/M, z26.s, z1.s\n"
+      "smin z27.s, p2/M, z27.s, z1.s\n"
+      "smax z31.s, p2/M, z31.s, z0.s\n"
+      "smax z20.s, p2/M, z20.s, z0.s\n"
+      "smax z21.s, p2/M, z21.s, z0.s\n"
       "uzp1 z31.h, z31.h, z20.h\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z22.s, p2/M, z22.s, z0.s\n"
+      "smax z16.s, p2/M, z16.s, z0.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z31.b, z31.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z0.s\n"
+      "smax z18.s, p2/M, z18.s, z0.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
       "st1b { z31.b }, p1, [x27]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z0.s\n"
+      "smax z23.s, p2/M, z23.s, z0.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z28.s, p2/M, z28.s, z5.s\n"
-      "smax z29.s, p2/M, z29.s, z5.s\n"
+      "smax z28.s, p2/M, z28.s, z0.s\n"
+      "smax z29.s, p2/M, z29.s, z0.s\n"
       "uzp1 z23.h, z23.h, z28.h\n"
-      "st1b { z16.b }, p1, [x22]\n"
-      "smax z30.s, p2/M, z30.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z28.h, z29.h, z30.h\n"
-      "uzp1 z23.b, z23.b, z28.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "st1b { z16.b }, p1, [x23]\n"
+      "smax z30.s, p2/M, z30.s, z0.s\n"
+      "smax z24.s, p2/M, z24.s, z0.s\n"
+      "uzp1 z16.h, z29.h, z30.h\n"
+      "uzp1 z23.b, z23.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z0.s\n"
+      "smax z26.s, p2/M, z26.s, z0.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z23.b }, p1, [x21]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x20]\n"
+      "st1b { z23.b }, p1, [x22]\n"
+      "smax z27.s, p2/M, z27.s, z0.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x21]\n"
       "addvl x27, x27, #1\n"
       "56:"  // Height 4: Writeback done
       "decw x9, ALL, MUL #4\n"
@@ -1407,7 +1407,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "58:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1415,4 +1414,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
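Note on the requantization epilogue in the hunks above: they only renumber scratch vector and address registers (e.g. z4..z10 become z16..z23, x23/x25 become x20/x21); the arithmetic — sqrdmulh by the per-layer multiplier, the optional shift correction, the srshl rounding shift, the c_offset add, the smin/smax clamp and the uzp1 narrowing — is unchanged. For reference, below is a minimal scalar sketch of that per-lane requantization, assuming the usual gemmlowp-style semantics carried by Requantize32. The function name and signature are illustrative only; the shift-correction path taken when flags bit 5 is set and the SQRDMULH saturation corner case (both inputs INT32_MIN) are omitted for brevity.

// Minimal scalar sketch of the per-lane requantization performed by the SVE
// epilogue above (sqrdmulh -> rounding right shift -> c_offset -> clamp).
// Illustrative only; not library API.
#include <cstdint>
#include <algorithm>

static inline int8_t requantize_lane(int32_t acc, int32_t multiplier,
                                     int32_t right_shift, int32_t c_offset,
                                     int32_t minval, int32_t maxval)
{
    // SQRDMULH: rounding doubling multiply returning the high half.
    int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
    int32_t high = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);

    // SRSHL with a negative shift amount acts as a rounding arithmetic shift right.
    int32_t shifted = right_shift > 0
        ? static_cast<int32_t>((static_cast<int64_t>(high) +
                                (int64_t(1) << (right_shift - 1))) >> right_shift)
        : high;

    int32_t out = shifted + c_offset;   // "add z.., z.., <c_offset>"
    out = std::min(out, maxval);        // smin against maxval
    out = std::max(out, minval);        // smax against minval
    return static_cast<int8_t>(out);    // uzp1 narrowing down to bytes
}
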
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
index dad04c8..056ae7a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, int8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
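The generic.cpp hunks below make the same kind of change to the accumulation loops: the B-panel load temporaries z6/z7 are renumbered to z16/z17 (z20/z21 in the height-3 path), the roles of x20/x21 around the input offset are swapped, and some activation loads are reordered; the SDOT arithmetic itself is untouched. As a reference for reading those hunks, here is a simplified scalar model of one indexed sdot operation such as "sdot z8.s, z16.b, z0.b[0]". It ignores the per-128-bit-segment index selection that real SVE performs and uses illustrative names only.

// Simplified model of SDOT (indexed): each 32-bit accumulator lane gains the
// 4-way int8 dot product of its 4 weight bytes with the selected 4 activation
// bytes. Real SVE picks the 4-byte group per 128-bit segment; this sketch
// broadcasts a single group across all lanes for clarity.
#include <cstdint>

static void sdot_indexed_model(int32_t *acc,           // 32-bit accumulator lanes
                               const int8_t *weights,  // 4 bytes per 32-bit lane
                               const int8_t *a_group,  // the 4 selected bytes
                               int lanes)
{
    for (int lane = 0; lane < lanes; ++lane) {
        int32_t sum = 0;
        for (int k = 0; k < 4; ++k) {
            sum += static_cast<int32_t>(weights[lane * 4 + k]) *
                   static_cast<int32_t>(a_group[k]);
        }
        acc[lane] += sum;   // accumulate into the existing lane value
    }
}
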
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index 1e71806..c28717a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -113,11 +113,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -130,101 +130,101 @@
       "7:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9]\n"
+      "sdot z8.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z9.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z10.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z11.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "sdot z8.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "sdot z9.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "sdot z10.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "sdot z11.s, z16.b, z0.b[1]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[2]\n"
+      "sdot z9.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[2]\n"
+      "sdot z11.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[3]\n"
+      "sdot z9.s, z16.b, z0.b[3]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z10.s, z17.b, z0.b[3]\n"
+      "sdot z11.s, z16.b, z0.b[3]\n"
       "add x26, x26, #0x10\n"
       "bgt 7b\n"
       "8:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9]\n"
+      "sdot z8.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z9.s, z16.b, z0.b[0]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[0]\n"
+      "sdot z11.s, z16.b, z0.b[0]\n"
       "addvl x9, x9, #4\n"
       "ble 9f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[1]\n"
+      "sdot z9.s, z16.b, z0.b[1]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z10.s, z17.b, z0.b[1]\n"
+      "sdot z11.s, z16.b, z0.b[1]\n"
       "addvl x9, x9, #4\n"
       "ble 9f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[2]\n"
+      "sdot z9.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z10.s, z17.b, z0.b[2]\n"
+      "sdot z11.s, z16.b, z0.b[2]\n"
       "addvl x9, x9, #4\n"
       "ble 9f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[3]\n"
+      "sdot z9.s, z16.b, z0.b[3]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[3]\n"
+      "sdot z11.s, z16.b, z0.b[3]\n"
       "addvl x9, x9, #4\n"
       "9:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 4b\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
+      "ld1w { z17.s }, p2/Z, [x14]\n"
+      "ld1w { z16.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "add z8.s, z8.s, z17.s\n"
+      "add z9.s, z9.s, z16.s\n"
+      "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add z10.s, z10.s, z17.s\n"
+      "add z11.s, z11.s, z16.s\n"
       "addvl x14, x14, #4\n"
       "tbz %x[flags], #4, 10f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
@@ -239,10 +239,10 @@
       "addvl x13, x13, #4\n"
       "b 11f\n"
       "10:"  // Height 1: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -255,44 +255,44 @@
       ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
       ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
       "tbz %x[flags], #5, 12f\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
+      "and z19.d, z8.d, z0.d\n"
+      "and z18.d, z9.d, z1.d\n"
+      "and z17.d, z10.d, z2.d\n"
+      "and z16.d, z11.d, z3.d\n"
+      "asr z19.s, z19.s, #0x1f\n"
+      "asr z18.s, z18.s, #0x1f\n"
+      "asr z17.s, z17.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z19.s\n"
+      "sqadd z9.s, z9.s, z18.s\n"
+      "sqadd z10.s, z10.s, z17.s\n"
+      "sqadd z11.s, z11.s, z16.s\n"
       "12:"  // Height 1: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z8.s, z8.s, z16.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z16.s\n"
+      "add z10.s, z10.s, z16.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z17.s }, p2/Z, [x20]\n"
+      "add z11.s, z11.s, z16.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "smin z8.s, p2/M, z8.s, z17.s\n"
+      "smin z9.s, p2/M, z9.s, z17.s\n"
+      "smin z10.s, p2/M, z10.s, z17.s\n"
+      "smin z11.s, p2/M, z11.s, z17.s\n"
+      "smax z8.s, p2/M, z8.s, z16.s\n"
+      "smax z9.s, p2/M, z9.s, z16.s\n"
+      "smax z10.s, p2/M, z10.s, z16.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
+      "smax z11.s, p2/M, z11.s, z16.s\n"
+      "uzp1 z16.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z16.b\n"
       "st1b { z8.b }, p1, [x11]\n"
       "addvl x11, x11, #1\n"
       "13:"  // Height 1: Writeback done
@@ -323,12 +323,12 @@
       "17:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 18f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -336,150 +336,150 @@
       "b 19f\n"
       "18:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "19:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "ble 21f\n"
       "20:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z1.b }, p0/Z, [x26]\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[0]\n"
+      "sdot z12.s, z17.b, z0.b[0]\n"
+      "sdot z9.s, z16.b, z1.b[0]\n"
+      "sdot z13.s, z16.b, z0.b[0]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z1.b[0]\n"
+      "sdot z14.s, z17.b, z0.b[0]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
       "cmp x27, #0x10\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "sdot z11.s, z16.b, z1.b[0]\n"
+      "sdot z15.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[1]\n"
+      "sdot z12.s, z17.b, z0.b[1]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "sdot z9.s, z16.b, z1.b[1]\n"
+      "sdot z13.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z10.s, z17.b, z1.b[1]\n"
+      "sdot z14.s, z17.b, z0.b[1]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "sdot z11.s, z16.b, z1.b[1]\n"
+      "sdot z15.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[2]\n"
+      "sdot z12.s, z17.b, z0.b[2]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "sdot z9.s, z16.b, z1.b[2]\n"
+      "sdot z13.s, z16.b, z0.b[2]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      "sdot z10.s, z17.b, z1.b[2]\n"
+      "sdot z14.s, z17.b, z0.b[2]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "sdot z11.s, z16.b, z1.b[2]\n"
+      "sdot z15.s, z16.b, z0.b[2]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[3]\n"
+      "sdot z12.s, z17.b, z0.b[3]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "sdot z9.s, z16.b, z1.b[3]\n"
+      "sdot z13.s, z16.b, z0.b[3]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "sdot z10.s, z17.b, z1.b[3]\n"
+      "sdot z14.s, z17.b, z0.b[3]\n"
+      "sdot z11.s, z16.b, z1.b[3]\n"
+      "sdot z15.s, z16.b, z0.b[3]\n"
       "bgt 20b\n"
       "21:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "subs x27, x27, #0x4\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[0]\n"
+      "sdot z12.s, z17.b, z1.b[0]\n"
+      "sdot z9.s, z16.b, z0.b[0]\n"
+      "sdot z13.s, z16.b, z1.b[0]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[0]\n"
+      "sdot z14.s, z17.b, z1.b[0]\n"
       "addvl x9, x9, #4\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z11.s, z16.b, z0.b[0]\n"
+      "sdot z15.s, z16.b, z1.b[0]\n"
       "ble 22f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[1]\n"
+      "sdot z12.s, z17.b, z1.b[1]\n"
+      "sdot z9.s, z16.b, z0.b[1]\n"
+      "sdot z13.s, z16.b, z1.b[1]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z10.s, z17.b, z0.b[1]\n"
+      "sdot z14.s, z17.b, z1.b[1]\n"
       "addvl x9, x9, #4\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z11.s, z16.b, z0.b[1]\n"
+      "sdot z15.s, z16.b, z1.b[1]\n"
       "ble 22f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[2]\n"
+      "sdot z12.s, z17.b, z1.b[2]\n"
+      "sdot z9.s, z16.b, z0.b[2]\n"
+      "sdot z13.s, z16.b, z1.b[2]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z10.s, z17.b, z0.b[2]\n"
+      "sdot z14.s, z17.b, z1.b[2]\n"
       "addvl x9, x9, #4\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z11.s, z16.b, z0.b[2]\n"
+      "sdot z15.s, z16.b, z1.b[2]\n"
       "ble 22f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[3]\n"
+      "sdot z12.s, z17.b, z1.b[3]\n"
+      "sdot z9.s, z16.b, z0.b[3]\n"
+      "sdot z13.s, z16.b, z1.b[3]\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[3]\n"
+      "sdot z14.s, z17.b, z1.b[3]\n"
       "addvl x9, x9, #4\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z11.s, z16.b, z0.b[3]\n"
+      "sdot z15.s, z16.b, z1.b[3]\n"
       "22:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 17b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "add x24, x11, x20\n"
-      "add z8.s, z8.s, z0.s\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z12.s, z12.s, z0.s\n"
+      "ld1w { z19.s }, p2/Z, [x14]\n"
+      "add x26, x11, x20\n"
+      "add z8.s, z8.s, z19.s\n"
+      "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "add z9.s, z9.s, z18.s\n"
+      "add z10.s, z10.s, z17.s\n"
+      "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add z11.s, z11.s, z16.s\n"
+      "add z12.s, z12.s, z19.s\n"
       "addvl x14, x14, #4\n"
-      "add z13.s, z13.s, z1.s\n"
-      "add z14.s, z14.s, z2.s\n"
-      "add z15.s, z15.s, z3.s\n"
+      "add z13.s, z13.s, z18.s\n"
+      "add z14.s, z14.s, z17.s\n"
+      "add z15.s, z15.s, z16.s\n"
       "tbz %x[flags], #4, 23f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -493,10 +493,10 @@
       "addvl x13, x13, #4\n"
       "b 24f\n"
       "23:"  // Height 2: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -513,77 +513,77 @@
       ".inst 0x04a675ce  // sqrdmulh z14.s, z14.s, z6.s\n"
       ".inst 0x04a775ef  // sqrdmulh z15.s, z15.s, z7.s\n"
       "tbz %x[flags], #5, 25f\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z12.d, z0.d\n"
-      "and z5.d, z13.d, z1.d\n"
-      "and z6.d, z14.d, z2.d\n"
-      "and z7.d, z15.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z12.s, z12.s, z4.s\n"
-      "sqadd z13.s, z13.s, z5.s\n"
-      "sqadd z14.s, z14.s, z6.s\n"
-      "sqadd z15.s, z15.s, z7.s\n"
+      "and z19.d, z8.d, z0.d\n"
+      "and z18.d, z9.d, z1.d\n"
+      "and z17.d, z10.d, z2.d\n"
+      "and z16.d, z11.d, z3.d\n"
+      "asr z19.s, z19.s, #0x1f\n"
+      "asr z18.s, z18.s, #0x1f\n"
+      "asr z17.s, z17.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z19.s\n"
+      "sqadd z9.s, z9.s, z18.s\n"
+      "sqadd z10.s, z10.s, z17.s\n"
+      "sqadd z11.s, z11.s, z16.s\n"
+      "and z19.d, z12.d, z0.d\n"
+      "and z18.d, z13.d, z1.d\n"
+      "and z17.d, z14.d, z2.d\n"
+      "and z16.d, z15.d, z3.d\n"
+      "asr z19.s, z19.s, #0x1f\n"
+      "asr z18.s, z18.s, #0x1f\n"
+      "asr z17.s, z17.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z19.s\n"
+      "sqadd z13.s, z13.s, z18.s\n"
+      "sqadd z14.s, z14.s, z17.s\n"
+      "sqadd z15.s, z15.s, z16.s\n"
       "25:"  // Height 2: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z17.s }, p2/Z, [x20]\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z8.s, z8.s, z17.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z17.s\n"
+      "add z10.s, z10.s, z17.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z12.s, z12.s, z4.s\n"
+      "add z11.s, z11.s, z17.s\n"
+      "add z12.s, z12.s, z17.s\n"
       ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
       ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
-      "add z13.s, z13.s, z4.s\n"
-      "add z14.s, z14.s, z4.s\n"
+      "add z13.s, z13.s, z17.s\n"
+      "add z14.s, z14.s, z17.s\n"
       ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z15.s, z15.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "add z15.s, z15.s, z17.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z17.s }, p2/Z, [x20]\n"
+      "smin z8.s, p2/M, z8.s, z16.s\n"
+      "smin z9.s, p2/M, z9.s, z16.s\n"
+      "smin z10.s, p2/M, z10.s, z16.s\n"
+      "smin z11.s, p2/M, z11.s, z16.s\n"
+      "smin z12.s, p2/M, z12.s, z16.s\n"
+      "smin z13.s, p2/M, z13.s, z16.s\n"
+      "smin z14.s, p2/M, z14.s, z16.s\n"
+      "smin z15.s, p2/M, z15.s, z16.s\n"
+      "smax z8.s, p2/M, z8.s, z17.s\n"
+      "smax z9.s, p2/M, z9.s, z17.s\n"
+      "smax z10.s, p2/M, z10.s, z17.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z17.s\n"
+      "smax z12.s, p2/M, z12.s, z17.s\n"
+      "uzp1 z16.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z16.b\n"
+      "smax z13.s, p2/M, z13.s, z17.s\n"
+      "smax z14.s, p2/M, z14.s, z17.s\n"
       "uzp1 z12.h, z12.h, z13.h\n"
       "st1b { z8.b }, p1, [x11]\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "st1b { z12.b }, p1, [x24]\n"
+      "smax z15.s, p2/M, z15.s, z17.s\n"
+      "uzp1 z16.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z16.b\n"
+      "st1b { z12.b }, p1, [x26]\n"
       "addvl x11, x11, #1\n"
       "26:"  // Height 2: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -617,13 +617,13 @@
       "30:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 32f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -632,86 +632,86 @@
       "b 32f\n"
       "31:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "32:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "ble 34f\n"
       "33:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x24]\n"
+      "ld1b { z21.b }, p2/Z, [x9]\n"
+      "sdot z8.s, z21.b, z2.b[0]\n"
+      "sdot z12.s, z21.b, z1.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z16.s, z21.b, z0.b[0]\n"
+      "sdot z9.s, z20.b, z2.b[0]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[0]\n"
+      "sdot z17.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
       "cmp x27, #0x10\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z10.s, z21.b, z2.b[0]\n"
+      "sdot z14.s, z21.b, z1.b[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "sdot z18.s, z21.b, z0.b[0]\n"
+      "sdot z11.s, z20.b, z2.b[0]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "sdot z15.s, z20.b, z1.b[0]\n"
+      "sdot z19.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "sdot z8.s, z21.b, z2.b[1]\n"
+      "sdot z12.s, z21.b, z1.b[1]\n"
+      "sdot z16.s, z21.b, z0.b[1]\n"
+      "sdot z9.s, z20.b, z2.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[1]\n"
+      "sdot z17.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z10.s, z21.b, z2.b[1]\n"
+      "sdot z14.s, z21.b, z1.b[1]\n"
+      "sdot z18.s, z21.b, z0.b[1]\n"
+      "sdot z11.s, z20.b, z2.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "sdot z15.s, z20.b, z1.b[1]\n"
+      "sdot z19.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      "sdot z8.s, z21.b, z2.b[2]\n"
+      "sdot z12.s, z21.b, z1.b[2]\n"
+      "sdot z16.s, z21.b, z0.b[2]\n"
+      "sdot z9.s, z20.b, z2.b[2]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[2]\n"
+      "sdot z17.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      "sdot z10.s, z21.b, z2.b[2]\n"
+      "sdot z14.s, z21.b, z1.b[2]\n"
+      "sdot z18.s, z21.b, z0.b[2]\n"
+      "sdot z11.s, z20.b, z2.b[2]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "sdot z15.s, z20.b, z1.b[2]\n"
+      "sdot z19.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      "sdot z8.s, z21.b, z2.b[3]\n"
+      "sdot z12.s, z21.b, z1.b[3]\n"
+      "sdot z16.s, z21.b, z0.b[3]\n"
+      "sdot z9.s, z20.b, z2.b[3]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[3]\n"
+      "sdot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "sdot z10.s, z21.b, z2.b[3]\n"
+      "sdot z14.s, z21.b, z1.b[3]\n"
+      "sdot z18.s, z21.b, z0.b[3]\n"
+      "sdot z11.s, z20.b, z2.b[3]\n"
+      "sdot z15.s, z20.b, z1.b[3]\n"
+      "sdot z19.s, z20.b, z0.b[3]\n"
       "bgt 33b\n"
       "34:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -719,104 +719,104 @@
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "subs x27, x27, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x9]\n"
+      "sdot z8.s, z21.b, z0.b[0]\n"
+      "sdot z12.s, z21.b, z1.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z16.s, z21.b, z2.b[0]\n"
+      "sdot z9.s, z20.b, z0.b[0]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[0]\n"
+      "sdot z17.s, z20.b, z2.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z10.s, z21.b, z0.b[0]\n"
+      "sdot z14.s, z21.b, z1.b[0]\n"
+      "sdot z18.s, z21.b, z2.b[0]\n"
+      "sdot z11.s, z20.b, z0.b[0]\n"
+      "sdot z15.s, z20.b, z1.b[0]\n"
+      "sdot z19.s, z20.b, z2.b[0]\n"
       "ble 35f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x9]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z21.b, z0.b[1]\n"
+      "sdot z12.s, z21.b, z1.b[1]\n"
+      "sdot z16.s, z21.b, z2.b[1]\n"
+      "sdot z9.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[1]\n"
+      "sdot z17.s, z20.b, z2.b[1]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z10.s, z21.b, z0.b[1]\n"
+      "sdot z14.s, z21.b, z1.b[1]\n"
+      "sdot z18.s, z21.b, z2.b[1]\n"
+      "sdot z11.s, z20.b, z0.b[1]\n"
+      "sdot z15.s, z20.b, z1.b[1]\n"
+      "sdot z19.s, z20.b, z2.b[1]\n"
       "ble 35f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x9]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z21.b, z0.b[2]\n"
+      "sdot z12.s, z21.b, z1.b[2]\n"
+      "sdot z16.s, z21.b, z2.b[2]\n"
+      "sdot z9.s, z20.b, z0.b[2]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[2]\n"
+      "sdot z17.s, z20.b, z2.b[2]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z10.s, z21.b, z0.b[2]\n"
+      "sdot z14.s, z21.b, z1.b[2]\n"
+      "sdot z18.s, z21.b, z2.b[2]\n"
+      "sdot z11.s, z20.b, z0.b[2]\n"
+      "sdot z15.s, z20.b, z1.b[2]\n"
+      "sdot z19.s, z20.b, z2.b[2]\n"
       "ble 35f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x9]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z21.b, z0.b[3]\n"
+      "sdot z12.s, z21.b, z1.b[3]\n"
+      "sdot z16.s, z21.b, z2.b[3]\n"
+      "sdot z9.s, z20.b, z0.b[3]\n"
+      "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[3]\n"
+      "sdot z17.s, z20.b, z2.b[3]\n"
+      "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z10.s, z21.b, z0.b[3]\n"
+      "sdot z14.s, z21.b, z1.b[3]\n"
+      "sdot z18.s, z21.b, z2.b[3]\n"
+      "sdot z11.s, z20.b, z0.b[3]\n"
+      "sdot z15.s, z20.b, z1.b[3]\n"
+      "sdot z19.s, z20.b, z2.b[3]\n"
       "35:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 30b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
+      "ld1w { z23.s }, p2/Z, [x14]\n"
+      "add x26, x11, x20\n"
+      "add x25, x26, x20\n"
+      "ld1w { z22.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "add z8.s, z8.s, z23.s\n"
+      "add z9.s, z9.s, z22.s\n"
+      "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add z10.s, z10.s, z21.s\n"
+      "add z11.s, z11.s, z20.s\n"
       "addvl x14, x14, #4\n"
-      "add z12.s, z12.s, z0.s\n"
-      "add z13.s, z13.s, z1.s\n"
-      "add z14.s, z14.s, z2.s\n"
-      "add z15.s, z15.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "add z12.s, z12.s, z23.s\n"
+      "add z13.s, z13.s, z22.s\n"
+      "add z14.s, z14.s, z21.s\n"
+      "add z15.s, z15.s, z20.s\n"
+      "add z16.s, z16.s, z23.s\n"
+      "add z17.s, z17.s, z22.s\n"
+      "add z18.s, z18.s, z21.s\n"
+      "add z19.s, z19.s, z20.s\n"
       "tbz %x[flags], #4, 36f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -830,10 +830,10 @@
       "addvl x13, x13, #4\n"
       "b 37f\n"
       "36:"  // Height 3: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -854,109 +854,109 @@
       ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
       ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
       "tbz %x[flags], #5, 38f\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z12.d, z0.d\n"
-      "and z5.d, z13.d, z1.d\n"
-      "and z6.d, z14.d, z2.d\n"
-      "and z7.d, z15.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z12.s, z12.s, z4.s\n"
-      "sqadd z13.s, z13.s, z5.s\n"
-      "sqadd z14.s, z14.s, z6.s\n"
-      "sqadd z15.s, z15.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z23.d, z8.d, z0.d\n"
+      "and z22.d, z9.d, z1.d\n"
+      "and z21.d, z10.d, z2.d\n"
+      "and z20.d, z11.d, z3.d\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z23.s\n"
+      "sqadd z9.s, z9.s, z22.s\n"
+      "sqadd z10.s, z10.s, z21.s\n"
+      "sqadd z11.s, z11.s, z20.s\n"
+      "and z23.d, z12.d, z0.d\n"
+      "and z22.d, z13.d, z1.d\n"
+      "and z21.d, z14.d, z2.d\n"
+      "and z20.d, z15.d, z3.d\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z23.s\n"
+      "sqadd z13.s, z13.s, z22.s\n"
+      "sqadd z14.s, z14.s, z21.s\n"
+      "sqadd z15.s, z15.s, z20.s\n"
+      "and z23.d, z16.d, z0.d\n"
+      "and z22.d, z17.d, z1.d\n"
+      "and z21.d, z18.d, z2.d\n"
+      "and z20.d, z19.d, z3.d\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z23.s\n"
+      "sqadd z17.s, z17.s, z22.s\n"
+      "sqadd z18.s, z18.s, z21.s\n"
+      "sqadd z19.s, z19.s, z20.s\n"
       "38:"  // Height 3: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z8.s, z8.s, z21.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z21.s\n"
+      "add z10.s, z10.s, z21.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z12.s, z12.s, z4.s\n"
+      "add z11.s, z11.s, z21.s\n"
+      "add z12.s, z12.s, z21.s\n"
       ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
       ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
-      "add z13.s, z13.s, z4.s\n"
-      "add z14.s, z14.s, z4.s\n"
+      "add z13.s, z13.s, z21.s\n"
+      "add z14.s, z14.s, z21.s\n"
       ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z15.s, z15.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z15.s, z15.s, z21.s\n"
+      "add z16.s, z16.s, z21.s\n"
       ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
       ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z21.s\n"
+      "add z18.s, z18.s, z21.s\n"
       ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z21.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
+      "smin z8.s, p2/M, z8.s, z20.s\n"
+      "smin z9.s, p2/M, z9.s, z20.s\n"
+      "smin z10.s, p2/M, z10.s, z20.s\n"
+      "smin z11.s, p2/M, z11.s, z20.s\n"
+      "smin z12.s, p2/M, z12.s, z20.s\n"
+      "smin z13.s, p2/M, z13.s, z20.s\n"
+      "smin z14.s, p2/M, z14.s, z20.s\n"
+      "smin z15.s, p2/M, z15.s, z20.s\n"
+      "smin z16.s, p2/M, z16.s, z20.s\n"
+      "smin z17.s, p2/M, z17.s, z20.s\n"
+      "smin z18.s, p2/M, z18.s, z20.s\n"
+      "smin z19.s, p2/M, z19.s, z20.s\n"
+      "smax z8.s, p2/M, z8.s, z21.s\n"
+      "smax z9.s, p2/M, z9.s, z21.s\n"
+      "smax z10.s, p2/M, z10.s, z21.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z21.s\n"
+      "smax z12.s, p2/M, z12.s, z21.s\n"
+      "uzp1 z20.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z20.b\n"
+      "smax z13.s, p2/M, z13.s, z21.s\n"
+      "smax z14.s, p2/M, z14.s, z21.s\n"
       "uzp1 z12.h, z12.h, z13.h\n"
       "st1b { z8.b }, p1, [x11]\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z21.s\n"
+      "smax z16.s, p2/M, z16.s, z21.s\n"
+      "uzp1 z20.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z20.b\n"
+      "smax z17.s, p2/M, z17.s, z21.s\n"
+      "smax z18.s, p2/M, z18.s, z21.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z12.b }, p1, [x24]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "st1b { z12.b }, p1, [x26]\n"
+      "smax z19.s, p2/M, z19.s, z21.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "st1b { z16.b }, p1, [x23]\n"
+      "st1b { z16.b }, p1, [x25]\n"
       "addvl x11, x11, #1\n"
       "39:"  // Height 3: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -994,14 +994,14 @@
       "43:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 44f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 45f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1011,105 +1011,105 @@
       "b 45f\n"
       "44:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "45:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "ble 47f\n"
       "46:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z3.b }, p0/Z, [x26]\n"
+      "ld1rqb { z2.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "ld1rqb { z0.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[0]\n"
+      "sdot z12.s, z25.b, z2.b[0]\n"
+      "sdot z16.s, z25.b, z1.b[0]\n"
+      "sdot z20.s, z25.b, z0.b[0]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
+      "sdot z9.s, z24.b, z3.b[0]\n"
+      "sdot z13.s, z24.b, z2.b[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "sdot z17.s, z24.b, z1.b[0]\n"
+      "sdot z21.s, z24.b, z0.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z25.b, z3.b[0]\n"
+      "sdot z14.s, z25.b, z2.b[0]\n"
+      "sdot z18.s, z25.b, z1.b[0]\n"
+      "sdot z22.s, z25.b, z0.b[0]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "sdot z11.s, z24.b, z3.b[0]\n"
+      "sdot z15.s, z24.b, z2.b[0]\n"
+      "sdot z19.s, z24.b, z1.b[0]\n"
+      "sdot z23.s, z24.b, z0.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[1]\n"
+      "sdot z12.s, z25.b, z2.b[1]\n"
+      "sdot z16.s, z25.b, z1.b[1]\n"
+      "sdot z20.s, z25.b, z0.b[1]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "sdot z9.s, z24.b, z3.b[1]\n"
+      "sdot z13.s, z24.b, z2.b[1]\n"
+      "sdot z17.s, z24.b, z1.b[1]\n"
+      "sdot z21.s, z24.b, z0.b[1]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z10.s, z25.b, z3.b[1]\n"
+      "sdot z14.s, z25.b, z2.b[1]\n"
+      "sdot z18.s, z25.b, z1.b[1]\n"
+      "sdot z22.s, z25.b, z0.b[1]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "sdot z11.s, z24.b, z3.b[1]\n"
+      "sdot z15.s, z24.b, z2.b[1]\n"
+      "sdot z19.s, z24.b, z1.b[1]\n"
+      "sdot z23.s, z24.b, z0.b[1]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[2]\n"
+      "sdot z12.s, z25.b, z2.b[2]\n"
+      "sdot z16.s, z25.b, z1.b[2]\n"
+      "sdot z20.s, z25.b, z0.b[2]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "sdot z9.s, z24.b, z3.b[2]\n"
+      "sdot z13.s, z24.b, z2.b[2]\n"
+      "sdot z17.s, z24.b, z1.b[2]\n"
+      "sdot z21.s, z24.b, z0.b[2]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      "sdot z10.s, z25.b, z3.b[2]\n"
+      "sdot z14.s, z25.b, z2.b[2]\n"
+      "sdot z18.s, z25.b, z1.b[2]\n"
+      "sdot z22.s, z25.b, z0.b[2]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "sdot z11.s, z24.b, z3.b[2]\n"
+      "sdot z15.s, z24.b, z2.b[2]\n"
+      "sdot z19.s, z24.b, z1.b[2]\n"
+      "sdot z23.s, z24.b, z0.b[2]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[3]\n"
+      "sdot z12.s, z25.b, z2.b[3]\n"
+      "sdot z16.s, z25.b, z1.b[3]\n"
+      "sdot z20.s, z25.b, z0.b[3]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "sdot z9.s, z24.b, z3.b[3]\n"
+      "sdot z13.s, z24.b, z2.b[3]\n"
+      "sdot z17.s, z24.b, z1.b[3]\n"
+      "sdot z21.s, z24.b, z0.b[3]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "sdot z10.s, z25.b, z3.b[3]\n"
+      "sdot z14.s, z25.b, z2.b[3]\n"
+      "sdot z18.s, z25.b, z1.b[3]\n"
+      "sdot z22.s, z25.b, z0.b[3]\n"
+      "sdot z11.s, z24.b, z3.b[3]\n"
+      "sdot z15.s, z24.b, z2.b[3]\n"
+      "sdot z19.s, z24.b, z1.b[3]\n"
+      "sdot z23.s, z24.b, z0.b[3]\n"
       "bgt 46b\n"
       "47:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -1118,125 +1118,125 @@
       "subs x27, x27, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[0]\n"
+      "sdot z12.s, z25.b, z1.b[0]\n"
+      "sdot z16.s, z25.b, z2.b[0]\n"
+      "sdot z20.s, z25.b, z3.b[0]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[0]\n"
+      "sdot z13.s, z24.b, z1.b[0]\n"
+      "sdot z17.s, z24.b, z2.b[0]\n"
+      "sdot z21.s, z24.b, z3.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z10.s, z25.b, z0.b[0]\n"
+      "sdot z14.s, z25.b, z1.b[0]\n"
+      "sdot z18.s, z25.b, z2.b[0]\n"
+      "sdot z22.s, z25.b, z3.b[0]\n"
+      "sdot z11.s, z24.b, z0.b[0]\n"
+      "sdot z15.s, z24.b, z1.b[0]\n"
+      "sdot z19.s, z24.b, z2.b[0]\n"
+      "sdot z23.s, z24.b, z3.b[0]\n"
       "ble 48f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[1]\n"
+      "sdot z12.s, z25.b, z1.b[1]\n"
+      "sdot z16.s, z25.b, z2.b[1]\n"
+      "sdot z20.s, z25.b, z3.b[1]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[1]\n"
+      "sdot z13.s, z24.b, z1.b[1]\n"
+      "sdot z17.s, z24.b, z2.b[1]\n"
+      "sdot z21.s, z24.b, z3.b[1]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z10.s, z25.b, z0.b[1]\n"
+      "sdot z14.s, z25.b, z1.b[1]\n"
+      "sdot z18.s, z25.b, z2.b[1]\n"
+      "sdot z22.s, z25.b, z3.b[1]\n"
+      "sdot z11.s, z24.b, z0.b[1]\n"
+      "sdot z15.s, z24.b, z1.b[1]\n"
+      "sdot z19.s, z24.b, z2.b[1]\n"
+      "sdot z23.s, z24.b, z3.b[1]\n"
       "ble 48f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[2]\n"
+      "sdot z12.s, z25.b, z1.b[2]\n"
+      "sdot z16.s, z25.b, z2.b[2]\n"
+      "sdot z20.s, z25.b, z3.b[2]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[2]\n"
+      "sdot z13.s, z24.b, z1.b[2]\n"
+      "sdot z17.s, z24.b, z2.b[2]\n"
+      "sdot z21.s, z24.b, z3.b[2]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z10.s, z25.b, z0.b[2]\n"
+      "sdot z14.s, z25.b, z1.b[2]\n"
+      "sdot z18.s, z25.b, z2.b[2]\n"
+      "sdot z22.s, z25.b, z3.b[2]\n"
+      "sdot z11.s, z24.b, z0.b[2]\n"
+      "sdot z15.s, z24.b, z1.b[2]\n"
+      "sdot z19.s, z24.b, z2.b[2]\n"
+      "sdot z23.s, z24.b, z3.b[2]\n"
       "ble 48f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[3]\n"
+      "sdot z12.s, z25.b, z1.b[3]\n"
+      "sdot z16.s, z25.b, z2.b[3]\n"
+      "sdot z20.s, z25.b, z3.b[3]\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[3]\n"
+      "sdot z13.s, z24.b, z1.b[3]\n"
+      "sdot z17.s, z24.b, z2.b[3]\n"
+      "sdot z21.s, z24.b, z3.b[3]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z10.s, z25.b, z0.b[3]\n"
+      "sdot z14.s, z25.b, z1.b[3]\n"
+      "sdot z18.s, z25.b, z2.b[3]\n"
+      "sdot z22.s, z25.b, z3.b[3]\n"
+      "sdot z11.s, z24.b, z0.b[3]\n"
+      "sdot z15.s, z24.b, z1.b[3]\n"
+      "sdot z19.s, z24.b, z2.b[3]\n"
+      "sdot z23.s, z24.b, z3.b[3]\n"
       "48:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 43b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "add x24, x11, x20\n"
-      "add x23, x24, x20\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "add x22, x23, x20\n"
-      "add z8.s, z8.s, z0.s\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
+      "ld1w { z27.s }, p2/Z, [x14]\n"
+      "add x26, x11, x20\n"
+      "add x25, x26, x20\n"
+      "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "add x24, x25, x20\n"
+      "add z8.s, z8.s, z27.s\n"
+      "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add z9.s, z9.s, z26.s\n"
+      "add z10.s, z10.s, z25.s\n"
       "addvl x14, x14, #4\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z12.s, z12.s, z0.s\n"
-      "add z13.s, z13.s, z1.s\n"
-      "add z14.s, z14.s, z2.s\n"
-      "add z15.s, z15.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
+      "add z11.s, z11.s, z24.s\n"
+      "add z12.s, z12.s, z27.s\n"
+      "add z13.s, z13.s, z26.s\n"
+      "add z14.s, z14.s, z25.s\n"
+      "add z15.s, z15.s, z24.s\n"
+      "add z16.s, z16.s, z27.s\n"
+      "add z17.s, z17.s, z26.s\n"
+      "add z18.s, z18.s, z25.s\n"
+      "add z19.s, z19.s, z24.s\n"
+      "add z20.s, z20.s, z27.s\n"
+      "add z21.s, z21.s, z26.s\n"
+      "add z22.s, z22.s, z25.s\n"
+      "add z23.s, z23.s, z24.s\n"
       "tbz %x[flags], #4, 49f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1250,10 +1250,10 @@
       "addvl x13, x13, #4\n"
       "b 50f\n"
       "49:"  // Height 4: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -1278,141 +1278,141 @@
       ".inst 0x04a676d6  // sqrdmulh z22.s, z22.s, z6.s\n"
       ".inst 0x04a776f7  // sqrdmulh z23.s, z23.s, z7.s\n"
       "tbz %x[flags], #5, 51f\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z12.d, z0.d\n"
-      "and z5.d, z13.d, z1.d\n"
-      "and z6.d, z14.d, z2.d\n"
-      "and z7.d, z15.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z12.s, z12.s, z4.s\n"
-      "sqadd z13.s, z13.s, z5.s\n"
-      "sqadd z14.s, z14.s, z6.s\n"
-      "sqadd z15.s, z15.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "and z4.d, z20.d, z0.d\n"
-      "and z5.d, z21.d, z1.d\n"
-      "and z6.d, z22.d, z2.d\n"
-      "and z7.d, z23.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z20.s, z20.s, z4.s\n"
-      "sqadd z21.s, z21.s, z5.s\n"
-      "sqadd z22.s, z22.s, z6.s\n"
-      "sqadd z23.s, z23.s, z7.s\n"
+      "and z27.d, z8.d, z0.d\n"
+      "and z26.d, z9.d, z1.d\n"
+      "and z25.d, z10.d, z2.d\n"
+      "and z24.d, z11.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z27.s\n"
+      "sqadd z9.s, z9.s, z26.s\n"
+      "sqadd z10.s, z10.s, z25.s\n"
+      "sqadd z11.s, z11.s, z24.s\n"
+      "and z27.d, z12.d, z0.d\n"
+      "and z26.d, z13.d, z1.d\n"
+      "and z25.d, z14.d, z2.d\n"
+      "and z24.d, z15.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z27.s\n"
+      "sqadd z13.s, z13.s, z26.s\n"
+      "sqadd z14.s, z14.s, z25.s\n"
+      "sqadd z15.s, z15.s, z24.s\n"
+      "and z27.d, z16.d, z0.d\n"
+      "and z26.d, z17.d, z1.d\n"
+      "and z25.d, z18.d, z2.d\n"
+      "and z24.d, z19.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z27.s\n"
+      "sqadd z17.s, z17.s, z26.s\n"
+      "sqadd z18.s, z18.s, z25.s\n"
+      "sqadd z19.s, z19.s, z24.s\n"
+      "and z27.d, z20.d, z0.d\n"
+      "and z26.d, z21.d, z1.d\n"
+      "and z25.d, z22.d, z2.d\n"
+      "and z24.d, z23.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z27.s\n"
+      "sqadd z21.s, z21.s, z26.s\n"
+      "sqadd z22.s, z22.s, z25.s\n"
+      "sqadd z23.s, z23.s, z24.s\n"
       "51:"  // Height 4: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z8.s, z8.s, z25.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z25.s\n"
+      "add z10.s, z10.s, z25.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z12.s, z12.s, z4.s\n"
+      "add z11.s, z11.s, z25.s\n"
+      "add z12.s, z12.s, z25.s\n"
       ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
       ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
-      "add z13.s, z13.s, z4.s\n"
-      "add z14.s, z14.s, z4.s\n"
+      "add z13.s, z13.s, z25.s\n"
+      "add z14.s, z14.s, z25.s\n"
       ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z15.s, z15.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z15.s, z15.s, z25.s\n"
+      "add z16.s, z16.s, z25.s\n"
       ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
       ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z25.s\n"
+      "add z18.s, z18.s, z25.s\n"
       ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z25.s\n"
+      "add z20.s, z20.s, z25.s\n"
       ".inst 0x44828835  // srshl z21.s, p2/M, z21.s, z1.s\n"
       ".inst 0x44828856  // srshl z22.s, p2/M, z22.s, z2.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z25.s\n"
+      "add z22.s, z22.s, z25.s\n"
       ".inst 0x44828877  // srshl z23.s, p2/M, z23.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z23.s, z23.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "add z23.s, z23.s, z25.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
+      "smin z8.s, p2/M, z8.s, z24.s\n"
+      "smin z9.s, p2/M, z9.s, z24.s\n"
+      "smin z10.s, p2/M, z10.s, z24.s\n"
+      "smin z11.s, p2/M, z11.s, z24.s\n"
+      "smin z12.s, p2/M, z12.s, z24.s\n"
+      "smin z13.s, p2/M, z13.s, z24.s\n"
+      "smin z14.s, p2/M, z14.s, z24.s\n"
+      "smin z15.s, p2/M, z15.s, z24.s\n"
+      "smin z16.s, p2/M, z16.s, z24.s\n"
+      "smin z17.s, p2/M, z17.s, z24.s\n"
+      "smin z18.s, p2/M, z18.s, z24.s\n"
+      "smin z19.s, p2/M, z19.s, z24.s\n"
+      "smin z20.s, p2/M, z20.s, z24.s\n"
+      "smin z21.s, p2/M, z21.s, z24.s\n"
+      "smin z22.s, p2/M, z22.s, z24.s\n"
+      "smin z23.s, p2/M, z23.s, z24.s\n"
+      "smax z8.s, p2/M, z8.s, z25.s\n"
+      "smax z9.s, p2/M, z9.s, z25.s\n"
+      "smax z10.s, p2/M, z10.s, z25.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z25.s\n"
+      "smax z12.s, p2/M, z12.s, z25.s\n"
+      "uzp1 z24.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z24.b\n"
+      "smax z13.s, p2/M, z13.s, z25.s\n"
+      "smax z14.s, p2/M, z14.s, z25.s\n"
       "uzp1 z12.h, z12.h, z13.h\n"
       "st1b { z8.b }, p1, [x11]\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z25.s\n"
+      "smax z16.s, p2/M, z16.s, z25.s\n"
+      "uzp1 z24.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z24.b\n"
+      "smax z17.s, p2/M, z17.s, z25.s\n"
+      "smax z18.s, p2/M, z18.s, z25.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z12.b }, p1, [x24]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "st1b { z12.b }, p1, [x26]\n"
+      "smax z19.s, p2/M, z19.s, z25.s\n"
+      "smax z20.s, p2/M, z20.s, z25.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z25.s\n"
+      "smax z22.s, p2/M, z22.s, z25.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
-      "st1b { z16.b }, p1, [x23]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "st1b { z20.b }, p1, [x22]\n"
+      "st1b { z16.b }, p1, [x25]\n"
+      "smax z23.s, p2/M, z23.s, z25.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "st1b { z20.b }, p1, [x24]\n"
       "addvl x11, x11, #1\n"
       "52:"  // Height 4: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -1454,15 +1454,15 @@
       "56:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 57f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 58f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1473,124 +1473,124 @@
       "b 58f\n"
       "57:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "58:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "ble 60f\n"
       "59:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z4.b }, p0/Z, [x26]\n"
+      "ld1rqb { z3.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1rqb { z0.b }, p0/Z, [x22]\n"
+      "ld1b { z29.b }, p2/Z, [x9]\n"
+      "sdot z8.s, z29.b, z4.b[0]\n"
+      "sdot z12.s, z29.b, z3.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z16.s, z29.b, z2.b[0]\n"
+      "sdot z20.s, z29.b, z1.b[0]\n"
       "add x25, x25, #0x10\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z24.s, z29.b, z0.b[0]\n"
+      "sdot z9.s, z28.b, z4.b[0]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z13.s, z28.b, z3.b[0]\n"
+      "sdot z17.s, z28.b, z2.b[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "sdot z21.s, z28.b, z1.b[0]\n"
+      "sdot z25.s, z28.b, z0.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z29.b, z4.b[0]\n"
+      "sdot z14.s, z29.b, z3.b[0]\n"
+      "sdot z18.s, z29.b, z2.b[0]\n"
+      "sdot z22.s, z29.b, z1.b[0]\n"
+      "sdot z26.s, z29.b, z0.b[0]\n"
+      "sdot z11.s, z28.b, z4.b[0]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "sdot z15.s, z28.b, z3.b[0]\n"
+      "sdot z19.s, z28.b, z2.b[0]\n"
+      "sdot z23.s, z28.b, z1.b[0]\n"
+      "sdot z27.s, z28.b, z0.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "sdot z8.s, z29.b, z4.b[1]\n"
+      "sdot z12.s, z29.b, z3.b[1]\n"
+      "sdot z16.s, z29.b, z2.b[1]\n"
+      "sdot z20.s, z29.b, z1.b[1]\n"
+      "sdot z24.s, z29.b, z0.b[1]\n"
+      "sdot z9.s, z28.b, z4.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "sdot z13.s, z28.b, z3.b[1]\n"
+      "sdot z17.s, z28.b, z2.b[1]\n"
+      "sdot z21.s, z28.b, z1.b[1]\n"
+      "sdot z25.s, z28.b, z0.b[1]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z10.s, z29.b, z4.b[1]\n"
+      "sdot z14.s, z29.b, z3.b[1]\n"
+      "sdot z18.s, z29.b, z2.b[1]\n"
+      "sdot z22.s, z29.b, z1.b[1]\n"
+      "sdot z26.s, z29.b, z0.b[1]\n"
+      "sdot z11.s, z28.b, z4.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "sdot z15.s, z28.b, z3.b[1]\n"
+      "sdot z19.s, z28.b, z2.b[1]\n"
+      "sdot z23.s, z28.b, z1.b[1]\n"
+      "sdot z27.s, z28.b, z0.b[1]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      "sdot z8.s, z29.b, z4.b[2]\n"
+      "sdot z12.s, z29.b, z3.b[2]\n"
+      "sdot z16.s, z29.b, z2.b[2]\n"
+      "sdot z20.s, z29.b, z1.b[2]\n"
+      "sdot z24.s, z29.b, z0.b[2]\n"
+      "sdot z9.s, z28.b, z4.b[2]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "sdot z13.s, z28.b, z3.b[2]\n"
+      "sdot z17.s, z28.b, z2.b[2]\n"
+      "sdot z21.s, z28.b, z1.b[2]\n"
+      "sdot z25.s, z28.b, z0.b[2]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      "sdot z10.s, z29.b, z4.b[2]\n"
+      "sdot z14.s, z29.b, z3.b[2]\n"
+      "sdot z18.s, z29.b, z2.b[2]\n"
+      "sdot z22.s, z29.b, z1.b[2]\n"
+      "sdot z26.s, z29.b, z0.b[2]\n"
+      "sdot z11.s, z28.b, z4.b[2]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "sdot z15.s, z28.b, z3.b[2]\n"
+      "sdot z19.s, z28.b, z2.b[2]\n"
+      "sdot z23.s, z28.b, z1.b[2]\n"
+      "sdot z27.s, z28.b, z0.b[2]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      "sdot z8.s, z29.b, z4.b[3]\n"
+      "sdot z12.s, z29.b, z3.b[3]\n"
+      "sdot z16.s, z29.b, z2.b[3]\n"
+      "sdot z20.s, z29.b, z1.b[3]\n"
+      "sdot z24.s, z29.b, z0.b[3]\n"
+      "sdot z9.s, z28.b, z4.b[3]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "sdot z13.s, z28.b, z3.b[3]\n"
+      "sdot z17.s, z28.b, z2.b[3]\n"
+      "sdot z21.s, z28.b, z1.b[3]\n"
+      "sdot z25.s, z28.b, z0.b[3]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "sdot z10.s, z29.b, z4.b[3]\n"
+      "sdot z14.s, z29.b, z3.b[3]\n"
+      "sdot z18.s, z29.b, z2.b[3]\n"
+      "sdot z22.s, z29.b, z1.b[3]\n"
+      "sdot z26.s, z29.b, z0.b[3]\n"
+      "sdot z11.s, z28.b, z4.b[3]\n"
+      "sdot z15.s, z28.b, z3.b[3]\n"
+      "sdot z19.s, z28.b, z2.b[3]\n"
+      "sdot z23.s, z28.b, z1.b[3]\n"
+      "sdot z27.s, z28.b, z0.b[3]\n"
       "bgt 59b\n"
       "60:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -1600,146 +1600,146 @@
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
       "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x9]\n"
+      "sdot z8.s, z29.b, z0.b[0]\n"
+      "sdot z12.s, z29.b, z1.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z16.s, z29.b, z2.b[0]\n"
+      "sdot z20.s, z29.b, z3.b[0]\n"
+      "sdot z24.s, z29.b, z4.b[0]\n"
+      "sdot z9.s, z28.b, z0.b[0]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[0]\n"
+      "sdot z17.s, z28.b, z2.b[0]\n"
+      "sdot z21.s, z28.b, z3.b[0]\n"
+      "sdot z25.s, z28.b, z4.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
+      "sdot z10.s, z29.b, z0.b[0]\n"
+      "sdot z14.s, z29.b, z1.b[0]\n"
+      "sdot z18.s, z29.b, z2.b[0]\n"
+      "sdot z22.s, z29.b, z3.b[0]\n"
+      "sdot z26.s, z29.b, z4.b[0]\n"
+      "sdot z11.s, z28.b, z0.b[0]\n"
+      "sdot z15.s, z28.b, z1.b[0]\n"
+      "sdot z19.s, z28.b, z2.b[0]\n"
+      "sdot z23.s, z28.b, z3.b[0]\n"
+      "sdot z27.s, z28.b, z4.b[0]\n"
       "ble 61f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x9]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z29.b, z0.b[1]\n"
+      "sdot z12.s, z29.b, z1.b[1]\n"
+      "sdot z16.s, z29.b, z2.b[1]\n"
+      "sdot z20.s, z29.b, z3.b[1]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z24.s, z29.b, z4.b[1]\n"
+      "sdot z9.s, z28.b, z0.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[1]\n"
+      "sdot z17.s, z28.b, z2.b[1]\n"
+      "sdot z21.s, z28.b, z3.b[1]\n"
+      "sdot z25.s, z28.b, z4.b[1]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
+      "sdot z10.s, z29.b, z0.b[1]\n"
+      "sdot z14.s, z29.b, z1.b[1]\n"
+      "sdot z18.s, z29.b, z2.b[1]\n"
+      "sdot z22.s, z29.b, z3.b[1]\n"
+      "sdot z26.s, z29.b, z4.b[1]\n"
+      "sdot z11.s, z28.b, z0.b[1]\n"
+      "sdot z15.s, z28.b, z1.b[1]\n"
+      "sdot z19.s, z28.b, z2.b[1]\n"
+      "sdot z23.s, z28.b, z3.b[1]\n"
+      "sdot z27.s, z28.b, z4.b[1]\n"
       "ble 61f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z29.b }, p2/Z, [x9]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z29.b, z0.b[2]\n"
+      "sdot z12.s, z29.b, z1.b[2]\n"
+      "sdot z16.s, z29.b, z2.b[2]\n"
+      "sdot z20.s, z29.b, z3.b[2]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z24.s, z29.b, z4.b[2]\n"
+      "sdot z9.s, z28.b, z0.b[2]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[2]\n"
+      "sdot z17.s, z28.b, z2.b[2]\n"
+      "sdot z21.s, z28.b, z3.b[2]\n"
+      "sdot z25.s, z28.b, z4.b[2]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
+      "sdot z10.s, z29.b, z0.b[2]\n"
+      "sdot z14.s, z29.b, z1.b[2]\n"
+      "sdot z18.s, z29.b, z2.b[2]\n"
+      "sdot z22.s, z29.b, z3.b[2]\n"
+      "sdot z26.s, z29.b, z4.b[2]\n"
+      "sdot z11.s, z28.b, z0.b[2]\n"
+      "sdot z15.s, z28.b, z1.b[2]\n"
+      "sdot z19.s, z28.b, z2.b[2]\n"
+      "sdot z23.s, z28.b, z3.b[2]\n"
+      "sdot z27.s, z28.b, z4.b[2]\n"
       "ble 61f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x9]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z29.b, z0.b[3]\n"
+      "sdot z12.s, z29.b, z1.b[3]\n"
+      "sdot z16.s, z29.b, z2.b[3]\n"
+      "sdot z20.s, z29.b, z3.b[3]\n"
+      "sdot z24.s, z29.b, z4.b[3]\n"
+      "sdot z9.s, z28.b, z0.b[3]\n"
+      "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[3]\n"
+      "sdot z17.s, z28.b, z2.b[3]\n"
+      "sdot z21.s, z28.b, z3.b[3]\n"
+      "sdot z25.s, z28.b, z4.b[3]\n"
+      "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z10.s, z29.b, z0.b[3]\n"
+      "sdot z14.s, z29.b, z1.b[3]\n"
+      "sdot z18.s, z29.b, z2.b[3]\n"
+      "sdot z22.s, z29.b, z3.b[3]\n"
+      "sdot z26.s, z29.b, z4.b[3]\n"
+      "sdot z11.s, z28.b, z0.b[3]\n"
+      "sdot z15.s, z28.b, z1.b[3]\n"
+      "sdot z19.s, z28.b, z2.b[3]\n"
+      "sdot z23.s, z28.b, z3.b[3]\n"
+      "sdot z27.s, z28.b, z4.b[3]\n"
       "61:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 56b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
+      "add x26, x11, x20\n"
+      "ld1w { z31.s }, p2/Z, [x14]\n"
+      "add x25, x26, x20\n"
+      "ld1w { z30.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
+      "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add z8.s, z8.s, z31.s\n"
+      "add z9.s, z9.s, z30.s\n"
       "addvl x14, x14, #4\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z12.s, z12.s, z0.s\n"
-      "add z13.s, z13.s, z1.s\n"
-      "add z14.s, z14.s, z2.s\n"
-      "add z15.s, z15.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
+      "add z10.s, z10.s, z29.s\n"
+      "add z11.s, z11.s, z28.s\n"
+      "add z12.s, z12.s, z31.s\n"
+      "add z13.s, z13.s, z30.s\n"
+      "add z14.s, z14.s, z29.s\n"
+      "add z15.s, z15.s, z28.s\n"
+      "add z16.s, z16.s, z31.s\n"
+      "add z17.s, z17.s, z30.s\n"
+      "add z18.s, z18.s, z29.s\n"
+      "add z19.s, z19.s, z28.s\n"
+      "add z20.s, z20.s, z31.s\n"
+      "add z21.s, z21.s, z30.s\n"
+      "add z22.s, z22.s, z29.s\n"
+      "add z23.s, z23.s, z28.s\n"
+      "add z24.s, z24.s, z31.s\n"
+      "add z25.s, z25.s, z30.s\n"
+      "add z26.s, z26.s, z29.s\n"
+      "add z27.s, z27.s, z28.s\n"
       "tbz %x[flags], #4, 62f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1753,10 +1753,10 @@
       "addvl x13, x13, #4\n"
       "b 63f\n"
       "62:"  // Height 5: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -1785,173 +1785,173 @@
       ".inst 0x04a6775a  // sqrdmulh z26.s, z26.s, z6.s\n"
       ".inst 0x04a7777b  // sqrdmulh z27.s, z27.s, z7.s\n"
       "tbz %x[flags], #5, 64f\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z12.d, z0.d\n"
-      "and z5.d, z13.d, z1.d\n"
-      "and z6.d, z14.d, z2.d\n"
-      "and z7.d, z15.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z12.s, z12.s, z4.s\n"
-      "sqadd z13.s, z13.s, z5.s\n"
-      "sqadd z14.s, z14.s, z6.s\n"
-      "sqadd z15.s, z15.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "and z4.d, z20.d, z0.d\n"
-      "and z5.d, z21.d, z1.d\n"
-      "and z6.d, z22.d, z2.d\n"
-      "and z7.d, z23.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z20.s, z20.s, z4.s\n"
-      "sqadd z21.s, z21.s, z5.s\n"
-      "sqadd z22.s, z22.s, z6.s\n"
-      "sqadd z23.s, z23.s, z7.s\n"
-      "and z4.d, z24.d, z0.d\n"
-      "and z5.d, z25.d, z1.d\n"
-      "and z6.d, z26.d, z2.d\n"
-      "and z7.d, z27.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z24.s, z24.s, z4.s\n"
-      "sqadd z25.s, z25.s, z5.s\n"
-      "sqadd z26.s, z26.s, z6.s\n"
-      "sqadd z27.s, z27.s, z7.s\n"
+      "and z31.d, z8.d, z0.d\n"
+      "and z30.d, z9.d, z1.d\n"
+      "and z29.d, z10.d, z2.d\n"
+      "and z28.d, z11.d, z3.d\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z31.s\n"
+      "sqadd z9.s, z9.s, z30.s\n"
+      "sqadd z10.s, z10.s, z29.s\n"
+      "sqadd z11.s, z11.s, z28.s\n"
+      "and z31.d, z12.d, z0.d\n"
+      "and z30.d, z13.d, z1.d\n"
+      "and z29.d, z14.d, z2.d\n"
+      "and z28.d, z15.d, z3.d\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z31.s\n"
+      "sqadd z13.s, z13.s, z30.s\n"
+      "sqadd z14.s, z14.s, z29.s\n"
+      "sqadd z15.s, z15.s, z28.s\n"
+      "and z31.d, z16.d, z0.d\n"
+      "and z30.d, z17.d, z1.d\n"
+      "and z29.d, z18.d, z2.d\n"
+      "and z28.d, z19.d, z3.d\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z31.s\n"
+      "sqadd z17.s, z17.s, z30.s\n"
+      "sqadd z18.s, z18.s, z29.s\n"
+      "sqadd z19.s, z19.s, z28.s\n"
+      "and z31.d, z20.d, z0.d\n"
+      "and z30.d, z21.d, z1.d\n"
+      "and z29.d, z22.d, z2.d\n"
+      "and z28.d, z23.d, z3.d\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z31.s\n"
+      "sqadd z21.s, z21.s, z30.s\n"
+      "sqadd z22.s, z22.s, z29.s\n"
+      "sqadd z23.s, z23.s, z28.s\n"
+      "and z31.d, z24.d, z0.d\n"
+      "and z30.d, z25.d, z1.d\n"
+      "and z29.d, z26.d, z2.d\n"
+      "and z28.d, z27.d, z3.d\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z31.s\n"
+      "sqadd z25.s, z25.s, z30.s\n"
+      "sqadd z26.s, z26.s, z29.s\n"
+      "sqadd z27.s, z27.s, z28.s\n"
       "64:"  // Height 5: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z29.s }, p2/Z, [x20]\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z8.s, z8.s, z29.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z29.s\n"
+      "add z10.s, z10.s, z29.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z12.s, z12.s, z4.s\n"
+      "add z11.s, z11.s, z29.s\n"
+      "add z12.s, z12.s, z29.s\n"
       ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
       ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
-      "add z13.s, z13.s, z4.s\n"
-      "add z14.s, z14.s, z4.s\n"
+      "add z13.s, z13.s, z29.s\n"
+      "add z14.s, z14.s, z29.s\n"
       ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z15.s, z15.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z15.s, z15.s, z29.s\n"
+      "add z16.s, z16.s, z29.s\n"
       ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
       ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z29.s\n"
+      "add z18.s, z18.s, z29.s\n"
       ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z29.s\n"
+      "add z20.s, z20.s, z29.s\n"
       ".inst 0x44828835  // srshl z21.s, p2/M, z21.s, z1.s\n"
       ".inst 0x44828856  // srshl z22.s, p2/M, z22.s, z2.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z29.s\n"
+      "add z22.s, z22.s, z29.s\n"
       ".inst 0x44828877  // srshl z23.s, p2/M, z23.s, z3.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z23.s, z23.s, z29.s\n"
+      "add z24.s, z24.s, z29.s\n"
       ".inst 0x44828839  // srshl z25.s, p2/M, z25.s, z1.s\n"
       ".inst 0x4482885a  // srshl z26.s, p2/M, z26.s, z2.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z29.s\n"
+      "add z26.s, z26.s, z29.s\n"
       ".inst 0x4482887b  // srshl z27.s, p2/M, z27.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z29.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z29.s }, p2/Z, [x20]\n"
+      "smin z8.s, p2/M, z8.s, z28.s\n"
+      "smin z9.s, p2/M, z9.s, z28.s\n"
+      "smin z10.s, p2/M, z10.s, z28.s\n"
+      "smin z11.s, p2/M, z11.s, z28.s\n"
+      "smin z12.s, p2/M, z12.s, z28.s\n"
+      "smin z13.s, p2/M, z13.s, z28.s\n"
+      "smin z14.s, p2/M, z14.s, z28.s\n"
+      "smin z15.s, p2/M, z15.s, z28.s\n"
+      "smin z16.s, p2/M, z16.s, z28.s\n"
+      "smin z17.s, p2/M, z17.s, z28.s\n"
+      "smin z18.s, p2/M, z18.s, z28.s\n"
+      "smin z19.s, p2/M, z19.s, z28.s\n"
+      "smin z20.s, p2/M, z20.s, z28.s\n"
+      "smin z21.s, p2/M, z21.s, z28.s\n"
+      "smin z22.s, p2/M, z22.s, z28.s\n"
+      "smin z23.s, p2/M, z23.s, z28.s\n"
+      "smin z24.s, p2/M, z24.s, z28.s\n"
+      "smin z25.s, p2/M, z25.s, z28.s\n"
+      "smin z26.s, p2/M, z26.s, z28.s\n"
+      "smin z27.s, p2/M, z27.s, z28.s\n"
+      "smax z8.s, p2/M, z8.s, z29.s\n"
+      "smax z9.s, p2/M, z9.s, z29.s\n"
+      "smax z10.s, p2/M, z10.s, z29.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z29.s\n"
+      "smax z12.s, p2/M, z12.s, z29.s\n"
+      "uzp1 z28.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z28.b\n"
+      "smax z13.s, p2/M, z13.s, z29.s\n"
+      "smax z14.s, p2/M, z14.s, z29.s\n"
       "uzp1 z12.h, z12.h, z13.h\n"
       "st1b { z8.b }, p1, [x11]\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z29.s\n"
+      "smax z16.s, p2/M, z16.s, z29.s\n"
+      "uzp1 z28.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z28.b\n"
+      "smax z17.s, p2/M, z17.s, z29.s\n"
+      "smax z18.s, p2/M, z18.s, z29.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z12.b }, p1, [x24]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "st1b { z12.b }, p1, [x26]\n"
+      "smax z19.s, p2/M, z19.s, z29.s\n"
+      "smax z20.s, p2/M, z20.s, z29.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z29.s\n"
+      "smax z22.s, p2/M, z22.s, z29.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
-      "st1b { z16.b }, p1, [x23]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "st1b { z16.b }, p1, [x25]\n"
+      "smax z23.s, p2/M, z23.s, z29.s\n"
+      "smax z24.s, p2/M, z24.s, z29.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z29.s\n"
+      "smax z26.s, p2/M, z26.s, z29.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z20.b }, p1, [x22]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x21]\n"
+      "st1b { z20.b }, p1, [x24]\n"
+      "smax z27.s, p2/M, z27.s, z29.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
       "addvl x11, x11, #1\n"
       "65:"  // Height 5: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -2000,16 +2000,16 @@
       "69:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 70f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 71f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -2021,143 +2021,143 @@
       "b 71f\n"
       "70:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "71:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "ble 73f\n"
       "72:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z7.b }, p0/Z, [x26]\n"
+      "ld1rqb { z6.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z5.b }, p0/Z, [x24]\n"
+      "ld1rqb { z4.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1rqb { z5.b }, p0/Z, [x21]\n"
+      "ld1rqb { z3.b }, p0/Z, [x22]\n"
+      "ld1rqb { z2.b }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z1.b }, p2/Z, [x9]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[0]\n"
+      "sdot z12.s, z1.b, z6.b[0]\n"
+      "sdot z16.s, z1.b, z5.b[0]\n"
+      "sdot z20.s, z1.b, z4.b[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z28.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z24.s, z1.b, z3.b[0]\n"
+      "sdot z28.s, z1.b, z2.b[0]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "sdot z29.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z30.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
-      "sdot z31.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z28.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "sdot z29.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[0]\n"
+      "sdot z13.s, z0.b, z6.b[0]\n"
+      "sdot z17.s, z0.b, z5.b[0]\n"
+      "sdot z21.s, z0.b, z4.b[0]\n"
+      "sdot z25.s, z0.b, z3.b[0]\n"
+      "sdot z29.s, z0.b, z2.b[0]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z10.s, z1.b, z7.b[0]\n"
+      "sdot z14.s, z1.b, z6.b[0]\n"
+      "sdot z18.s, z1.b, z5.b[0]\n"
+      "sdot z22.s, z1.b, z4.b[0]\n"
+      "sdot z26.s, z1.b, z3.b[0]\n"
+      "sdot z30.s, z1.b, z2.b[0]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "sdot z11.s, z0.b, z7.b[0]\n"
+      "sdot z15.s, z0.b, z6.b[0]\n"
+      "sdot z19.s, z0.b, z5.b[0]\n"
+      "sdot z23.s, z0.b, z4.b[0]\n"
+      "sdot z27.s, z0.b, z3.b[0]\n"
+      "sdot z31.s, z0.b, z2.b[0]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[1]\n"
+      "sdot z12.s, z1.b, z6.b[1]\n"
+      "sdot z16.s, z1.b, z5.b[1]\n"
+      "sdot z20.s, z1.b, z4.b[1]\n"
+      "sdot z24.s, z1.b, z3.b[1]\n"
+      "sdot z28.s, z1.b, z2.b[1]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[1]\n"
+      "sdot z13.s, z0.b, z6.b[1]\n"
+      "sdot z17.s, z0.b, z5.b[1]\n"
+      "sdot z21.s, z0.b, z4.b[1]\n"
+      "sdot z25.s, z0.b, z3.b[1]\n"
+      "sdot z29.s, z0.b, z2.b[1]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z30.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
-      "sdot z31.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z28.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "sdot z29.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z30.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
-      "sdot z31.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z28.s, z6.b, z5.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "sdot z29.s, z7.b, z5.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z30.s, z6.b, z5.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
-      "sdot z31.s, z7.b, z5.b[3]\n"
+      "sdot z10.s, z1.b, z7.b[1]\n"
+      "sdot z14.s, z1.b, z6.b[1]\n"
+      "sdot z18.s, z1.b, z5.b[1]\n"
+      "sdot z22.s, z1.b, z4.b[1]\n"
+      "sdot z26.s, z1.b, z3.b[1]\n"
+      "sdot z30.s, z1.b, z2.b[1]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "sdot z11.s, z0.b, z7.b[1]\n"
+      "sdot z15.s, z0.b, z6.b[1]\n"
+      "sdot z19.s, z0.b, z5.b[1]\n"
+      "sdot z23.s, z0.b, z4.b[1]\n"
+      "sdot z27.s, z0.b, z3.b[1]\n"
+      "sdot z31.s, z0.b, z2.b[1]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[2]\n"
+      "sdot z12.s, z1.b, z6.b[2]\n"
+      "sdot z16.s, z1.b, z5.b[2]\n"
+      "sdot z20.s, z1.b, z4.b[2]\n"
+      "sdot z24.s, z1.b, z3.b[2]\n"
+      "sdot z28.s, z1.b, z2.b[2]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[2]\n"
+      "sdot z13.s, z0.b, z6.b[2]\n"
+      "sdot z17.s, z0.b, z5.b[2]\n"
+      "sdot z21.s, z0.b, z4.b[2]\n"
+      "sdot z25.s, z0.b, z3.b[2]\n"
+      "sdot z29.s, z0.b, z2.b[2]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      "sdot z10.s, z1.b, z7.b[2]\n"
+      "sdot z14.s, z1.b, z6.b[2]\n"
+      "sdot z18.s, z1.b, z5.b[2]\n"
+      "sdot z22.s, z1.b, z4.b[2]\n"
+      "sdot z26.s, z1.b, z3.b[2]\n"
+      "sdot z30.s, z1.b, z2.b[2]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "sdot z11.s, z0.b, z7.b[2]\n"
+      "sdot z15.s, z0.b, z6.b[2]\n"
+      "sdot z19.s, z0.b, z5.b[2]\n"
+      "sdot z23.s, z0.b, z4.b[2]\n"
+      "sdot z27.s, z0.b, z3.b[2]\n"
+      "sdot z31.s, z0.b, z2.b[2]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[3]\n"
+      "sdot z12.s, z1.b, z6.b[3]\n"
+      "sdot z16.s, z1.b, z5.b[3]\n"
+      "sdot z20.s, z1.b, z4.b[3]\n"
+      "sdot z24.s, z1.b, z3.b[3]\n"
+      "sdot z28.s, z1.b, z2.b[3]\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[3]\n"
+      "sdot z13.s, z0.b, z6.b[3]\n"
+      "sdot z17.s, z0.b, z5.b[3]\n"
+      "sdot z21.s, z0.b, z4.b[3]\n"
+      "sdot z25.s, z0.b, z3.b[3]\n"
+      "sdot z29.s, z0.b, z2.b[3]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "sdot z10.s, z1.b, z7.b[3]\n"
+      "sdot z14.s, z1.b, z6.b[3]\n"
+      "sdot z18.s, z1.b, z5.b[3]\n"
+      "sdot z22.s, z1.b, z4.b[3]\n"
+      "sdot z26.s, z1.b, z3.b[3]\n"
+      "sdot z30.s, z1.b, z2.b[3]\n"
+      "sdot z11.s, z0.b, z7.b[3]\n"
+      "sdot z15.s, z0.b, z6.b[3]\n"
+      "sdot z19.s, z0.b, z5.b[3]\n"
+      "sdot z23.s, z0.b, z4.b[3]\n"
+      "sdot z27.s, z0.b, z3.b[3]\n"
+      "sdot z31.s, z0.b, z2.b[3]\n"
       "bgt 72b\n"
       "73:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -2168,167 +2168,167 @@
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
       "ld1rqb { z4.b }, p0/Z, [x22]\n"
       "ld1rqb { z5.b }, p0/Z, [x21]\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z28.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "sdot z29.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x9]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[0]\n"
+      "sdot z12.s, z7.b, z1.b[0]\n"
+      "sdot z16.s, z7.b, z2.b[0]\n"
+      "sdot z20.s, z7.b, z3.b[0]\n"
+      "sdot z24.s, z7.b, z4.b[0]\n"
+      "sdot z28.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[0]\n"
+      "sdot z13.s, z6.b, z1.b[0]\n"
+      "sdot z17.s, z6.b, z2.b[0]\n"
+      "sdot z21.s, z6.b, z3.b[0]\n"
+      "sdot z25.s, z6.b, z4.b[0]\n"
+      "sdot z29.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z30.s, z6.b, z5.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
-      "sdot z31.s, z7.b, z5.b[0]\n"
+      "sdot z10.s, z7.b, z0.b[0]\n"
+      "sdot z14.s, z7.b, z1.b[0]\n"
+      "sdot z18.s, z7.b, z2.b[0]\n"
+      "sdot z22.s, z7.b, z3.b[0]\n"
+      "sdot z26.s, z7.b, z4.b[0]\n"
+      "sdot z30.s, z7.b, z5.b[0]\n"
+      "sdot z11.s, z6.b, z0.b[0]\n"
+      "sdot z15.s, z6.b, z1.b[0]\n"
+      "sdot z19.s, z6.b, z2.b[0]\n"
+      "sdot z23.s, z6.b, z3.b[0]\n"
+      "sdot z27.s, z6.b, z4.b[0]\n"
+      "sdot z31.s, z6.b, z5.b[0]\n"
       "ble 74f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x9]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[1]\n"
+      "sdot z12.s, z7.b, z1.b[1]\n"
+      "sdot z16.s, z7.b, z2.b[1]\n"
+      "sdot z20.s, z7.b, z3.b[1]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z28.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "sdot z29.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z24.s, z7.b, z4.b[1]\n"
+      "sdot z28.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[1]\n"
+      "sdot z13.s, z6.b, z1.b[1]\n"
+      "sdot z17.s, z6.b, z2.b[1]\n"
+      "sdot z21.s, z6.b, z3.b[1]\n"
+      "sdot z25.s, z6.b, z4.b[1]\n"
+      "sdot z29.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z30.s, z6.b, z5.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
-      "sdot z31.s, z7.b, z5.b[1]\n"
+      "sdot z10.s, z7.b, z0.b[1]\n"
+      "sdot z14.s, z7.b, z1.b[1]\n"
+      "sdot z18.s, z7.b, z2.b[1]\n"
+      "sdot z22.s, z7.b, z3.b[1]\n"
+      "sdot z26.s, z7.b, z4.b[1]\n"
+      "sdot z30.s, z7.b, z5.b[1]\n"
+      "sdot z11.s, z6.b, z0.b[1]\n"
+      "sdot z15.s, z6.b, z1.b[1]\n"
+      "sdot z19.s, z6.b, z2.b[1]\n"
+      "sdot z23.s, z6.b, z3.b[1]\n"
+      "sdot z27.s, z6.b, z4.b[1]\n"
+      "sdot z31.s, z6.b, z5.b[1]\n"
       "ble 74f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x9]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[2]\n"
+      "sdot z12.s, z7.b, z1.b[2]\n"
+      "sdot z16.s, z7.b, z2.b[2]\n"
+      "sdot z20.s, z7.b, z3.b[2]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z28.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "sdot z29.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "sdot z24.s, z7.b, z4.b[2]\n"
+      "sdot z28.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[2]\n"
+      "sdot z13.s, z6.b, z1.b[2]\n"
+      "sdot z17.s, z6.b, z2.b[2]\n"
+      "sdot z21.s, z6.b, z3.b[2]\n"
+      "sdot z25.s, z6.b, z4.b[2]\n"
+      "sdot z29.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z30.s, z6.b, z5.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
-      "sdot z31.s, z7.b, z5.b[2]\n"
+      "sdot z10.s, z7.b, z0.b[2]\n"
+      "sdot z14.s, z7.b, z1.b[2]\n"
+      "sdot z18.s, z7.b, z2.b[2]\n"
+      "sdot z22.s, z7.b, z3.b[2]\n"
+      "sdot z26.s, z7.b, z4.b[2]\n"
+      "sdot z30.s, z7.b, z5.b[2]\n"
+      "sdot z11.s, z6.b, z0.b[2]\n"
+      "sdot z15.s, z6.b, z1.b[2]\n"
+      "sdot z19.s, z6.b, z2.b[2]\n"
+      "sdot z23.s, z6.b, z3.b[2]\n"
+      "sdot z27.s, z6.b, z4.b[2]\n"
+      "sdot z31.s, z6.b, z5.b[2]\n"
       "ble 74f\n"
-      "ld1b { z6.b }, p2/Z, [x9]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z28.s, z6.b, z5.b[3]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "sdot z29.s, z7.b, z5.b[3]\n"
-      "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x9]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[3]\n"
+      "sdot z12.s, z7.b, z1.b[3]\n"
+      "sdot z16.s, z7.b, z2.b[3]\n"
+      "sdot z20.s, z7.b, z3.b[3]\n"
+      "sdot z24.s, z7.b, z4.b[3]\n"
+      "sdot z28.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[3]\n"
+      "sdot z13.s, z6.b, z1.b[3]\n"
+      "sdot z17.s, z6.b, z2.b[3]\n"
+      "sdot z21.s, z6.b, z3.b[3]\n"
+      "sdot z25.s, z6.b, z4.b[3]\n"
+      "sdot z29.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z30.s, z6.b, z5.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
-      "sdot z31.s, z7.b, z5.b[3]\n"
+      "sdot z10.s, z7.b, z0.b[3]\n"
+      "sdot z14.s, z7.b, z1.b[3]\n"
+      "sdot z18.s, z7.b, z2.b[3]\n"
+      "sdot z22.s, z7.b, z3.b[3]\n"
+      "sdot z26.s, z7.b, z4.b[3]\n"
+      "sdot z30.s, z7.b, z5.b[3]\n"
+      "sdot z11.s, z6.b, z0.b[3]\n"
+      "sdot z15.s, z6.b, z1.b[3]\n"
+      "sdot z19.s, z6.b, z2.b[3]\n"
+      "sdot z23.s, z6.b, z3.b[3]\n"
+      "sdot z27.s, z6.b, z4.b[3]\n"
+      "sdot z31.s, z6.b, z5.b[3]\n"
       "74:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 69b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x11, x20\n"
+      "add x26, x11, x20\n"
+      "add x25, x26, x20\n"
+      "ld1w { z3.s }, p2/Z, [x14]\n"
+      "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "add x24, x25, x20\n"
       "add x23, x24, x20\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
       "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add x20, x21, x20\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
+      "add z8.s, z8.s, z3.s\n"
+      "add z9.s, z9.s, z2.s\n"
+      "add z10.s, z10.s, z1.s\n"
+      "add z11.s, z11.s, z0.s\n"
       "addvl x14, x14, #4\n"
-      "add z12.s, z12.s, z0.s\n"
-      "add z13.s, z13.s, z1.s\n"
-      "add z14.s, z14.s, z2.s\n"
-      "add z15.s, z15.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      "add z28.s, z28.s, z0.s\n"
-      "add z29.s, z29.s, z1.s\n"
-      "add z30.s, z30.s, z2.s\n"
-      "add z31.s, z31.s, z3.s\n"
+      "add z12.s, z12.s, z3.s\n"
+      "add z13.s, z13.s, z2.s\n"
+      "add z14.s, z14.s, z1.s\n"
+      "add z15.s, z15.s, z0.s\n"
+      "add z16.s, z16.s, z3.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z1.s\n"
+      "add z19.s, z19.s, z0.s\n"
+      "add z20.s, z20.s, z3.s\n"
+      "add z21.s, z21.s, z2.s\n"
+      "add z22.s, z22.s, z1.s\n"
+      "add z23.s, z23.s, z0.s\n"
+      "add z24.s, z24.s, z3.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z1.s\n"
+      "add z27.s, z27.s, z0.s\n"
+      "add z28.s, z28.s, z3.s\n"
+      "add z29.s, z29.s, z2.s\n"
+      "add z30.s, z30.s, z1.s\n"
+      "add z31.s, z31.s, z0.s\n"
       "tbz %x[flags], #4, 75f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -2342,10 +2342,10 @@
       "addvl x13, x13, #4\n"
       "b 76f\n"
       "75:"  // Height 6: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -2378,81 +2378,81 @@
       ".inst 0x04a677de  // sqrdmulh z30.s, z30.s, z6.s\n"
       ".inst 0x04a777ff  // sqrdmulh z31.s, z31.s, z7.s\n"
       "tbz %x[flags], #5, 77f\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "and z7.d, z8.d, z0.d\n"
+      "and z6.d, z9.d, z1.d\n"
+      "and z5.d, z10.d, z2.d\n"
+      "and z4.d, z11.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z12.d, z0.d\n"
-      "and z5.d, z13.d, z1.d\n"
-      "and z6.d, z14.d, z2.d\n"
-      "and z7.d, z15.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
       "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z12.s, z12.s, z4.s\n"
-      "sqadd z13.s, z13.s, z5.s\n"
-      "sqadd z14.s, z14.s, z6.s\n"
-      "sqadd z15.s, z15.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
       "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "and z4.d, z20.d, z0.d\n"
-      "and z5.d, z21.d, z1.d\n"
-      "and z6.d, z22.d, z2.d\n"
-      "and z7.d, z23.d, z3.d\n"
       "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z7.s\n"
+      "sqadd z9.s, z9.s, z6.s\n"
+      "sqadd z10.s, z10.s, z5.s\n"
+      "sqadd z11.s, z11.s, z4.s\n"
+      "and z7.d, z12.d, z0.d\n"
+      "and z6.d, z13.d, z1.d\n"
+      "and z5.d, z14.d, z2.d\n"
+      "and z4.d, z15.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z20.s, z20.s, z4.s\n"
-      "sqadd z21.s, z21.s, z5.s\n"
-      "sqadd z22.s, z22.s, z6.s\n"
-      "sqadd z23.s, z23.s, z7.s\n"
-      "and z4.d, z24.d, z0.d\n"
-      "and z5.d, z25.d, z1.d\n"
-      "and z6.d, z26.d, z2.d\n"
-      "and z7.d, z27.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
       "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z7.s\n"
+      "sqadd z13.s, z13.s, z6.s\n"
+      "sqadd z14.s, z14.s, z5.s\n"
+      "sqadd z15.s, z15.s, z4.s\n"
+      "and z7.d, z16.d, z0.d\n"
+      "and z6.d, z17.d, z1.d\n"
+      "and z5.d, z18.d, z2.d\n"
+      "and z4.d, z19.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z24.s, z24.s, z4.s\n"
-      "sqadd z25.s, z25.s, z5.s\n"
-      "sqadd z26.s, z26.s, z6.s\n"
-      "sqadd z27.s, z27.s, z7.s\n"
-      "and z4.d, z28.d, z0.d\n"
-      "and z5.d, z29.d, z1.d\n"
-      "and z6.d, z30.d, z2.d\n"
-      "and z7.d, z31.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
       "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z7.s\n"
+      "sqadd z17.s, z17.s, z6.s\n"
+      "sqadd z18.s, z18.s, z5.s\n"
+      "sqadd z19.s, z19.s, z4.s\n"
+      "and z7.d, z20.d, z0.d\n"
+      "and z6.d, z21.d, z1.d\n"
+      "and z5.d, z22.d, z2.d\n"
+      "and z4.d, z23.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z28.s, z28.s, z4.s\n"
-      "sqadd z29.s, z29.s, z5.s\n"
-      "sqadd z30.s, z30.s, z6.s\n"
-      "sqadd z31.s, z31.s, z7.s\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z7.s\n"
+      "sqadd z21.s, z21.s, z6.s\n"
+      "sqadd z22.s, z22.s, z5.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "and z7.d, z24.d, z0.d\n"
+      "and z6.d, z25.d, z1.d\n"
+      "and z5.d, z26.d, z2.d\n"
+      "and z4.d, z27.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z7.s\n"
+      "sqadd z25.s, z25.s, z6.s\n"
+      "sqadd z26.s, z26.s, z5.s\n"
+      "sqadd z27.s, z27.s, z4.s\n"
+      "and z7.d, z28.d, z0.d\n"
+      "and z6.d, z29.d, z1.d\n"
+      "and z5.d, z30.d, z2.d\n"
+      "and z4.d, z31.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z28.s, z28.s, z7.s\n"
+      "sqadd z29.s, z29.s, z6.s\n"
+      "sqadd z30.s, z30.s, z5.s\n"
+      "sqadd z31.s, z31.s, z4.s\n"
       "77:"  // Height 6: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
       "add z8.s, z8.s, z4.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
@@ -2500,83 +2500,83 @@
       "add z29.s, z29.s, z4.s\n"
       "add z30.s, z30.s, z4.s\n"
       ".inst 0x4482887f  // srshl z31.s, p2/M, z31.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
       "add z31.s, z31.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smin z28.s, p2/M, z28.s, z6.s\n"
-      "smin z29.s, p2/M, z29.s, z6.s\n"
-      "smin z30.s, p2/M, z30.s, z6.s\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "smin z8.s, p2/M, z8.s, z0.s\n"
+      "smin z9.s, p2/M, z9.s, z0.s\n"
+      "smin z10.s, p2/M, z10.s, z0.s\n"
+      "smin z11.s, p2/M, z11.s, z0.s\n"
+      "smin z12.s, p2/M, z12.s, z0.s\n"
+      "smin z13.s, p2/M, z13.s, z0.s\n"
+      "smin z14.s, p2/M, z14.s, z0.s\n"
+      "smin z15.s, p2/M, z15.s, z0.s\n"
+      "smin z16.s, p2/M, z16.s, z0.s\n"
+      "smin z17.s, p2/M, z17.s, z0.s\n"
+      "smin z18.s, p2/M, z18.s, z0.s\n"
+      "smin z19.s, p2/M, z19.s, z0.s\n"
+      "smin z20.s, p2/M, z20.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z0.s\n"
+      "smin z22.s, p2/M, z22.s, z0.s\n"
+      "smin z23.s, p2/M, z23.s, z0.s\n"
+      "smin z24.s, p2/M, z24.s, z0.s\n"
+      "smin z25.s, p2/M, z25.s, z0.s\n"
+      "smin z26.s, p2/M, z26.s, z0.s\n"
+      "smin z27.s, p2/M, z27.s, z0.s\n"
+      "smin z28.s, p2/M, z28.s, z0.s\n"
+      "smin z29.s, p2/M, z29.s, z0.s\n"
+      "smin z30.s, p2/M, z30.s, z0.s\n"
+      "smin z31.s, p2/M, z31.s, z0.s\n"
+      "smax z8.s, p2/M, z8.s, z1.s\n"
+      "smax z9.s, p2/M, z9.s, z1.s\n"
+      "smax z10.s, p2/M, z10.s, z1.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z1.s\n"
+      "smax z12.s, p2/M, z12.s, z1.s\n"
+      "uzp1 z0.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z0.b\n"
+      "smax z13.s, p2/M, z13.s, z1.s\n"
+      "smax z14.s, p2/M, z14.s, z1.s\n"
       "uzp1 z12.h, z12.h, z13.h\n"
       "st1b { z8.b }, p1, [x11]\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "uzp1 z13.h, z14.h, z15.h\n"
-      "uzp1 z12.b, z12.b, z13.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z1.s\n"
+      "smax z16.s, p2/M, z16.s, z1.s\n"
+      "uzp1 z0.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z0.b\n"
+      "smax z17.s, p2/M, z17.s, z1.s\n"
+      "smax z18.s, p2/M, z18.s, z1.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z12.b }, p1, [x24]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "st1b { z12.b }, p1, [x26]\n"
+      "smax z19.s, p2/M, z19.s, z1.s\n"
+      "smax z20.s, p2/M, z20.s, z1.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z1.s\n"
+      "smax z22.s, p2/M, z22.s, z1.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
-      "st1b { z16.b }, p1, [x23]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "st1b { z16.b }, p1, [x25]\n"
+      "smax z23.s, p2/M, z23.s, z1.s\n"
+      "smax z24.s, p2/M, z24.s, z1.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z1.s\n"
+      "smax z26.s, p2/M, z26.s, z1.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z20.b }, p1, [x22]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "smax z28.s, p2/M, z28.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "smax z29.s, p2/M, z29.s, z5.s\n"
-      "smax z30.s, p2/M, z30.s, z5.s\n"
+      "st1b { z20.b }, p1, [x24]\n"
+      "smax z27.s, p2/M, z27.s, z1.s\n"
+      "smax z28.s, p2/M, z28.s, z1.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "smax z29.s, p2/M, z29.s, z1.s\n"
+      "smax z30.s, p2/M, z30.s, z1.s\n"
       "uzp1 z28.h, z28.h, z29.h\n"
-      "st1b { z24.b }, p1, [x21]\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
-      "st1b { z28.b }, p1, [x20]\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "smax z31.s, p2/M, z31.s, z1.s\n"
+      "uzp1 z16.h, z30.h, z31.h\n"
+      "uzp1 z28.b, z28.b, z16.b\n"
+      "st1b { z28.b }, p1, [x22]\n"
       "addvl x11, x11, #1\n"
       "78:"  // Height 6: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -2594,7 +2594,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "80:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2602,4 +2601,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
index 2b7ad8b..b1b1135 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, int8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
index 6041794..cd5f854 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
@@ -117,11 +117,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -133,86 +133,86 @@
       "ble 8f\n"
       "7:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1rqb { z20.b }, p0/Z, [x26]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      ".inst 0x45119a88  // smmla z8.s, z20.b, z17.b\n"
+      ".inst 0x45109a8c  // smmla z12.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      ".inst 0x45119a89  // smmla z9.s, z20.b, z17.b\n"
+      ".inst 0x45109a8d  // smmla z13.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      ".inst 0x45119a8a  // smmla z10.s, z20.b, z17.b\n"
+      ".inst 0x45109a8e  // smmla z14.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      ".inst 0x45119a8b  // smmla z11.s, z20.b, z17.b\n"
+      ".inst 0x45109a8f  // smmla z15.s, z20.b, z16.b\n"
       "add x26, x26, #0x10\n"
       "bgt 7b\n"
       "8:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
       "addvl x9, x9, #8\n"
       "ble 9f\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45119828  // smmla z8.s, z1.b, z17.b\n"
+      ".inst 0x4510982c  // smmla z12.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45119829  // smmla z9.s, z1.b, z17.b\n"
+      ".inst 0x4510982d  // smmla z13.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x4511982a  // smmla z10.s, z1.b, z17.b\n"
+      ".inst 0x4510982e  // smmla z14.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x4511982b  // smmla z11.s, z1.b, z17.b\n"
+      ".inst 0x4510982f  // smmla z15.s, z1.b, z16.b\n"
       "addvl x9, x9, #8\n"
       "9:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -221,18 +221,18 @@
       "bne 4b\n"
       "uzp1 z8.d, z8.d, z12.d\n"
       "uzp1 z9.d, z9.d, z13.d\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x14]\n"
+      "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
       "uzp1 z10.d, z10.d, z14.d\n"
       "uzp1 z11.d, z11.d, z15.d\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
       "mov z15.d, z8.d\n"
-      "add z15.s, z15.s, z0.s\n"
+      "add z15.s, z15.s, z19.s\n"
       "addvl x14, x14, #4\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
+      "add z9.s, z9.s, z18.s\n"
+      "add z10.s, z10.s, z17.s\n"
+      "add z11.s, z11.s, z16.s\n"
       "tbz %x[flags], #4, 10f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -246,10 +246,10 @@
       "addvl x13, x13, #4\n"
       "b 11f\n"
       "10:"  // Height 1: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -262,44 +262,44 @@
       ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
       ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
       "tbz %x[flags], #5, 12f\n"
-      "and z4.d, z15.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z15.s, z15.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
+      "and z19.d, z15.d, z0.d\n"
+      "and z18.d, z9.d, z1.d\n"
+      "and z17.d, z10.d, z2.d\n"
+      "and z16.d, z11.d, z3.d\n"
+      "asr z19.s, z19.s, #0x1f\n"
+      "asr z18.s, z18.s, #0x1f\n"
+      "asr z17.s, z17.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z19.s\n"
+      "sqadd z9.s, z9.s, z18.s\n"
+      "sqadd z10.s, z10.s, z17.s\n"
+      "sqadd z11.s, z11.s, z16.s\n"
       "12:"  // Height 1: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z17.s }, p2/Z, [x20]\n"
       ".inst 0x4482880f  // srshl z15.s, p2/M, z15.s, z0.s\n"
-      "add z15.s, z15.s, z4.s\n"
+      "add z15.s, z15.s, z17.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z17.s\n"
+      "add z10.s, z10.s, z17.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "add z11.s, z11.s, z17.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z31.s }, p2/Z, [x20]\n"
+      "smin z15.s, p2/M, z15.s, z16.s\n"
+      "smin z9.s, p2/M, z9.s, z16.s\n"
+      "smin z10.s, p2/M, z10.s, z16.s\n"
+      "smin z11.s, p2/M, z11.s, z16.s\n"
+      "smax z15.s, p2/M, z15.s, z31.s\n"
+      "smax z9.s, p2/M, z9.s, z31.s\n"
+      "smax z10.s, p2/M, z10.s, z31.s\n"
       "uzp1 z15.h, z15.h, z9.h\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z15.b, z15.b, z9.b\n"
+      "smax z11.s, p2/M, z11.s, z31.s\n"
+      "uzp1 z16.h, z10.h, z11.h\n"
+      "uzp1 z15.b, z15.b, z16.b\n"
       "st1b { z15.b }, p1, [x11]\n"
       "addvl x11, x11, #1\n"
       "13:"  // Height 1: Writeback done
@@ -330,12 +330,12 @@
       "17:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 18f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 19f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -343,125 +343,125 @@
       "b 19f\n"
       "18:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "19:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "ble 21f\n"
       "20:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1rqb { z20.b }, p0/Z, [x26]\n"
+      "ld1rqb { z19.b }, p0/Z, [x25]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      ".inst 0x45119a88  // smmla z8.s, z20.b, z17.b\n"
+      ".inst 0x45109a8c  // smmla z12.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      ".inst 0x45119a89  // smmla z9.s, z20.b, z17.b\n"
+      ".inst 0x45109a8d  // smmla z13.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      ".inst 0x45119a8a  // smmla z10.s, z20.b, z17.b\n"
+      ".inst 0x45109a8e  // smmla z14.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      ".inst 0x45119a8b  // smmla z11.s, z20.b, z17.b\n"
+      ".inst 0x45109a8f  // smmla z15.s, z20.b, z16.b\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "bgt 20b\n"
       "21:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1rqb { z19.b }, p0/Z, [x25]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
       "addvl x9, x9, #8\n"
       "ble 22f\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      "ld1b { z17.b }, p2/Z, [x9]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45119828  // smmla z8.s, z1.b, z17.b\n"
+      ".inst 0x4510982c  // smmla z12.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45119829  // smmla z9.s, z1.b, z17.b\n"
+      ".inst 0x4510982d  // smmla z13.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x4511982a  // smmla z10.s, z1.b, z17.b\n"
+      ".inst 0x4510982e  // smmla z14.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n"
+      "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x4511982b  // smmla z11.s, z1.b, z17.b\n"
+      ".inst 0x4510982f  // smmla z15.s, z1.b, z16.b\n"
       "addvl x9, x9, #8\n"
       "22:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 17b\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "uzp1 z20.d, z8.d, z12.d\n"
       "uzp2 z8.d, z8.d, z12.d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
+      "ld1w { z19.s }, p2/Z, [x14]\n"
       "uzp1 z12.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n"
       "uzp1 z13.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add x24, x11, x20\n"
+      "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add x26, x11, x20\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
       "addvl x14, x14, #4\n"
-      "mov z15.d, z7.d\n"
-      "add z15.s, z15.s, z0.s\n"
-      "add z12.s, z12.s, z1.s\n"
-      "add z13.s, z13.s, z2.s\n"
-      "add z14.s, z14.s, z3.s\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
+      "mov z15.d, z20.d\n"
+      "add z15.s, z15.s, z19.s\n"
+      "add z12.s, z12.s, z18.s\n"
+      "add z13.s, z13.s, z17.s\n"
+      "add z14.s, z14.s, z16.s\n"
+      "add z8.s, z8.s, z19.s\n"
+      "add z9.s, z9.s, z18.s\n"
+      "add z10.s, z10.s, z17.s\n"
+      "add z11.s, z11.s, z16.s\n"
       "tbz %x[flags], #4, 23f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -475,10 +475,10 @@
       "addvl x13, x13, #4\n"
       "b 24f\n"
       "23:"  // Height 2: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -495,77 +495,77 @@
       ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
       ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
       "tbz %x[flags], #5, 25f\n"
-      "and z4.d, z15.d, z0.d\n"
-      "and z5.d, z12.d, z1.d\n"
-      "and z6.d, z13.d, z2.d\n"
-      "and z7.d, z14.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z15.s, z15.s, z4.s\n"
-      "sqadd z12.s, z12.s, z5.s\n"
-      "sqadd z13.s, z13.s, z6.s\n"
-      "sqadd z14.s, z14.s, z7.s\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
+      "and z19.d, z15.d, z0.d\n"
+      "and z18.d, z12.d, z1.d\n"
+      "and z17.d, z13.d, z2.d\n"
+      "and z16.d, z14.d, z3.d\n"
+      "asr z19.s, z19.s, #0x1f\n"
+      "asr z18.s, z18.s, #0x1f\n"
+      "asr z17.s, z17.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z19.s\n"
+      "sqadd z12.s, z12.s, z18.s\n"
+      "sqadd z13.s, z13.s, z17.s\n"
+      "sqadd z14.s, z14.s, z16.s\n"
+      "and z18.d, z8.d, z0.d\n"
+      "and z24.d, z9.d, z1.d\n"
+      "and z17.d, z10.d, z2.d\n"
+      "and z16.d, z11.d, z3.d\n"
+      "asr z18.s, z18.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "asr z17.s, z17.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z18.s\n"
+      "sqadd z9.s, z9.s, z24.s\n"
+      "sqadd z10.s, z10.s, z17.s\n"
+      "sqadd z11.s, z11.s, z16.s\n"
       "25:"  // Height 2: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z17.s }, p2/Z, [x20]\n"
       ".inst 0x4482880f  // srshl z15.s, p2/M, z15.s, z0.s\n"
-      "add z15.s, z15.s, z4.s\n"
+      "add z15.s, z15.s, z17.s\n"
       ".inst 0x4482882c  // srshl z12.s, p2/M, z12.s, z1.s\n"
       ".inst 0x4482884d  // srshl z13.s, p2/M, z13.s, z2.s\n"
-      "add z12.s, z12.s, z4.s\n"
-      "add z13.s, z13.s, z4.s\n"
+      "add z12.s, z12.s, z17.s\n"
+      "add z13.s, z13.s, z17.s\n"
       ".inst 0x4482886e  // srshl z14.s, p2/M, z14.s, z3.s\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z14.s, z14.s, z4.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z14.s, z14.s, z17.s\n"
+      "add z8.s, z8.s, z17.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z17.s\n"
+      "add z10.s, z10.s, z17.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "add z11.s, z11.s, z17.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z17.s }, p2/Z, [x20]\n"
+      "smin z15.s, p2/M, z15.s, z16.s\n"
+      "smin z12.s, p2/M, z12.s, z16.s\n"
+      "smin z13.s, p2/M, z13.s, z16.s\n"
+      "smin z14.s, p2/M, z14.s, z16.s\n"
+      "smin z8.s, p2/M, z8.s, z16.s\n"
+      "smin z9.s, p2/M, z9.s, z16.s\n"
+      "smin z10.s, p2/M, z10.s, z16.s\n"
+      "smin z11.s, p2/M, z11.s, z16.s\n"
+      "smax z15.s, p2/M, z15.s, z17.s\n"
+      "smax z12.s, p2/M, z12.s, z17.s\n"
+      "smax z13.s, p2/M, z13.s, z17.s\n"
       "uzp1 z15.h, z15.h, z12.h\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "uzp1 z12.h, z13.h, z14.h\n"
-      "uzp1 z15.b, z15.b, z12.b\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z14.s, p2/M, z14.s, z17.s\n"
+      "smax z8.s, p2/M, z8.s, z17.s\n"
+      "uzp1 z16.h, z13.h, z14.h\n"
+      "uzp1 z15.b, z15.b, z16.b\n"
+      "smax z9.s, p2/M, z9.s, z17.s\n"
+      "smax z10.s, p2/M, z10.s, z17.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
       "st1b { z15.b }, p1, [x11]\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "st1b { z8.b }, p1, [x24]\n"
+      "smax z11.s, p2/M, z11.s, z17.s\n"
+      "uzp1 z16.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z16.b\n"
+      "st1b { z8.b }, p1, [x26]\n"
       "addvl x11, x11, #1\n"
       "26:"  // Height 2: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -603,13 +603,13 @@
       "30:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 31f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 32f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -618,174 +618,174 @@
       "b 32f\n"
       "31:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "32:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "ble 34f\n"
       "33:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "ld1rqb { z30.b }, p0/Z, [x26]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "ld1rqb { z28.b }, p0/Z, [x24]\n"
+      "trn1 z27.d, z30.d, z24.d\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "trn1 z26.d, z28.d, z29.d\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45199b68  // smmla z8.s, z27.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189b6c  // smmla z12.s, z27.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45199b69  // smmla z9.s, z27.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z29.d\n"
+      ".inst 0x45189b6d  // smmla z13.s, z27.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x45199b6a  // smmla z10.s, z27.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x45189b6e  // smmla z14.s, z27.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      ".inst 0x45199b6b  // smmla z11.s, z27.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      ".inst 0x45189b6f  // smmla z15.s, z27.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      ".inst 0x45199bc8  // smmla z8.s, z30.b, z25.b\n"
+      ".inst 0x45199b90  // smmla z16.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x45189bcc  // smmla z12.s, z30.b, z24.b\n"
+      ".inst 0x45189b94  // smmla z20.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      ".inst 0x45199bc9  // smmla z9.s, z30.b, z25.b\n"
+      ".inst 0x45199b91  // smmla z17.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      ".inst 0x45189bcd  // smmla z13.s, z30.b, z24.b\n"
+      ".inst 0x45189b95  // smmla z21.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      ".inst 0x45199bca  // smmla z10.s, z30.b, z25.b\n"
+      ".inst 0x45199b92  // smmla z18.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      ".inst 0x45189bce  // smmla z14.s, z30.b, z24.b\n"
+      ".inst 0x45189b96  // smmla z22.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      ".inst 0x45199bcb  // smmla z11.s, z30.b, z25.b\n"
+      ".inst 0x45199b93  // smmla z19.s, z28.b, z25.b\n"
+      ".inst 0x45189bcf  // smmla z15.s, z30.b, z24.b\n"
+      ".inst 0x45189b97  // smmla z23.s, z28.b, z24.b\n"
       "bgt 33b\n"
       "34:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "trn1 z27.d, z1.d, z24.d\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "trn1 z26.d, z3.d, z28.d\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45199b68  // smmla z8.s, z27.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189b6c  // smmla z12.s, z27.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45199b69  // smmla z9.s, z27.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x45189b6d  // smmla z13.s, z27.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z28.d\n"
+      ".inst 0x45199b6a  // smmla z10.s, z27.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x45189b6e  // smmla z14.s, z27.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
+      ".inst 0x45199b6b  // smmla z11.s, z27.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      ".inst 0x45189b6f  // smmla z15.s, z27.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
       "ble 35f\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45199828  // smmla z8.s, z1.b, z25.b\n"
+      ".inst 0x45199870  // smmla z16.s, z3.b, z25.b\n"
+      ".inst 0x4518982c  // smmla z12.s, z1.b, z24.b\n"
+      ".inst 0x45189874  // smmla z20.s, z3.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45199829  // smmla z9.s, z1.b, z25.b\n"
+      ".inst 0x45199871  // smmla z17.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x4518982d  // smmla z13.s, z1.b, z24.b\n"
+      ".inst 0x45189875  // smmla z21.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x4519982a  // smmla z10.s, z1.b, z25.b\n"
+      ".inst 0x45199872  // smmla z18.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x4518982e  // smmla z14.s, z1.b, z24.b\n"
+      ".inst 0x45189876  // smmla z22.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x4519982b  // smmla z11.s, z1.b, z25.b\n"
+      ".inst 0x45199873  // smmla z19.s, z3.b, z25.b\n"
+      ".inst 0x4518982f  // smmla z15.s, z1.b, z24.b\n"
+      ".inst 0x45189877  // smmla z23.s, z3.b, z24.b\n"
       "35:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 30b\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "uzp1 z28.d, z8.d, z12.d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
+      "ld1w { z27.s }, p2/Z, [x14]\n"
       "uzp1 z12.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
       "uzp1 z13.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add x24, x11, x20\n"
+      "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add x26, x11, x20\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x20\n"
       "addvl x14, x14, #4\n"
       "uzp1 z16.d, z16.d, z20.d\n"
       "uzp1 z17.d, z17.d, z21.d\n"
       "uzp1 z18.d, z18.d, z22.d\n"
       "uzp1 z19.d, z19.d, z23.d\n"
-      "mov z23.d, z7.d\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z12.s, z12.s, z1.s\n"
-      "add z13.s, z13.s, z2.s\n"
-      "add z14.s, z14.s, z3.s\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "mov z23.d, z28.d\n"
+      "add z23.s, z23.s, z27.s\n"
+      "add z12.s, z12.s, z26.s\n"
+      "add z13.s, z13.s, z25.s\n"
+      "add z14.s, z14.s, z24.s\n"
+      "add z8.s, z8.s, z27.s\n"
+      "add z9.s, z9.s, z26.s\n"
+      "add z10.s, z10.s, z25.s\n"
+      "add z11.s, z11.s, z24.s\n"
+      "add z16.s, z16.s, z27.s\n"
+      "add z17.s, z17.s, z26.s\n"
+      "add z18.s, z18.s, z25.s\n"
+      "add z19.s, z19.s, z24.s\n"
       "tbz %x[flags], #4, 36f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -799,10 +799,10 @@
       "addvl x13, x13, #4\n"
       "b 37f\n"
       "36:"  // Height 3: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -823,109 +823,109 @@
       ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
       ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
       "tbz %x[flags], #5, 38f\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z12.d, z1.d\n"
-      "and z6.d, z13.d, z2.d\n"
-      "and z7.d, z14.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z12.s, z12.s, z5.s\n"
-      "sqadd z13.s, z13.s, z6.s\n"
-      "sqadd z14.s, z14.s, z7.s\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z24.d, z23.d, z0.d\n"
+      "and z22.d, z12.d, z1.d\n"
+      "and z21.d, z13.d, z2.d\n"
+      "and z20.d, z14.d, z3.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z24.s\n"
+      "sqadd z12.s, z12.s, z22.s\n"
+      "sqadd z13.s, z13.s, z21.s\n"
+      "sqadd z14.s, z14.s, z20.s\n"
+      "and z24.d, z8.d, z0.d\n"
+      "and z22.d, z9.d, z1.d\n"
+      "and z21.d, z10.d, z2.d\n"
+      "and z20.d, z11.d, z3.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z24.s\n"
+      "sqadd z9.s, z9.s, z22.s\n"
+      "sqadd z10.s, z10.s, z21.s\n"
+      "sqadd z11.s, z11.s, z20.s\n"
+      "and z24.d, z16.d, z0.d\n"
+      "and z22.d, z17.d, z1.d\n"
+      "and z21.d, z18.d, z2.d\n"
+      "and z20.d, z19.d, z3.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z24.s\n"
+      "sqadd z17.s, z17.s, z22.s\n"
+      "sqadd z18.s, z18.s, z21.s\n"
+      "sqadd z19.s, z19.s, z20.s\n"
       "38:"  // Height 3: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z23.s, z23.s, z21.s\n"
       ".inst 0x4482882c  // srshl z12.s, p2/M, z12.s, z1.s\n"
       ".inst 0x4482884d  // srshl z13.s, p2/M, z13.s, z2.s\n"
-      "add z12.s, z12.s, z4.s\n"
-      "add z13.s, z13.s, z4.s\n"
+      "add z12.s, z12.s, z21.s\n"
+      "add z13.s, z13.s, z21.s\n"
       ".inst 0x4482886e  // srshl z14.s, p2/M, z14.s, z3.s\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z14.s, z14.s, z4.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z14.s, z14.s, z21.s\n"
+      "add z8.s, z8.s, z21.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z21.s\n"
+      "add z10.s, z10.s, z21.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z11.s, z11.s, z21.s\n"
+      "add z16.s, z16.s, z21.s\n"
       ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
       ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z21.s\n"
+      "add z18.s, z18.s, z21.s\n"
       ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z21.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
+      "smin z23.s, p2/M, z23.s, z20.s\n"
+      "smin z12.s, p2/M, z12.s, z20.s\n"
+      "smin z13.s, p2/M, z13.s, z20.s\n"
+      "smin z14.s, p2/M, z14.s, z20.s\n"
+      "smin z8.s, p2/M, z8.s, z20.s\n"
+      "smin z9.s, p2/M, z9.s, z20.s\n"
+      "smin z10.s, p2/M, z10.s, z20.s\n"
+      "smin z11.s, p2/M, z11.s, z20.s\n"
+      "smin z16.s, p2/M, z16.s, z20.s\n"
+      "smin z17.s, p2/M, z17.s, z20.s\n"
+      "smin z18.s, p2/M, z18.s, z20.s\n"
+      "smin z19.s, p2/M, z19.s, z20.s\n"
+      "smax z23.s, p2/M, z23.s, z21.s\n"
+      "smax z12.s, p2/M, z12.s, z21.s\n"
+      "smax z13.s, p2/M, z13.s, z21.s\n"
       "uzp1 z23.h, z23.h, z12.h\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "uzp1 z12.h, z13.h, z14.h\n"
-      "uzp1 z23.b, z23.b, z12.b\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z14.s, p2/M, z14.s, z21.s\n"
+      "smax z8.s, p2/M, z8.s, z21.s\n"
+      "uzp1 z20.h, z13.h, z14.h\n"
+      "uzp1 z23.b, z23.b, z20.b\n"
+      "smax z9.s, p2/M, z9.s, z21.s\n"
+      "smax z10.s, p2/M, z10.s, z21.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
       "st1b { z23.b }, p1, [x11]\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z21.s\n"
+      "smax z16.s, p2/M, z16.s, z21.s\n"
+      "uzp1 z20.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z20.b\n"
+      "smax z17.s, p2/M, z17.s, z21.s\n"
+      "smax z18.s, p2/M, z18.s, z21.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z8.b }, p1, [x24]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "st1b { z8.b }, p1, [x26]\n"
+      "smax z19.s, p2/M, z19.s, z21.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "st1b { z16.b }, p1, [x23]\n"
+      "st1b { z16.b }, p1, [x25]\n"
       "addvl x11, x11, #1\n"
       "39:"  // Height 3: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -963,14 +963,14 @@
       "43:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 44f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 45f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -980,161 +980,161 @@
       "b 45f\n"
       "44:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "45:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "ble 47f\n"
       "46:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "ld1rqb { z30.b }, p0/Z, [x26]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "trn1 z29.d, z30.d, z24.d\n"
+      "ld1rqb { z28.b }, p0/Z, [x24]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "trn1 z26.d, z28.d, z27.d\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45199ba8  // smmla z8.s, z29.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189bac  // smmla z12.s, z29.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45199ba9  // smmla z9.s, z29.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z27.d\n"
+      ".inst 0x45189bad  // smmla z13.s, z29.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x45199baa  // smmla z10.s, z29.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x45189bae  // smmla z14.s, z29.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      ".inst 0x45199bab  // smmla z11.s, z29.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      ".inst 0x45189baf  // smmla z15.s, z29.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      ".inst 0x45199bc8  // smmla z8.s, z30.b, z25.b\n"
+      ".inst 0x45199b90  // smmla z16.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      ".inst 0x45189bcc  // smmla z12.s, z30.b, z24.b\n"
+      ".inst 0x45189b94  // smmla z20.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x45199bc9  // smmla z9.s, z30.b, z25.b\n"
+      ".inst 0x45199b91  // smmla z17.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      ".inst 0x45189bcd  // smmla z13.s, z30.b, z24.b\n"
+      ".inst 0x45189b95  // smmla z21.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      ".inst 0x45199bca  // smmla z10.s, z30.b, z25.b\n"
+      ".inst 0x45199b92  // smmla z18.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      ".inst 0x45189bce  // smmla z14.s, z30.b, z24.b\n"
+      ".inst 0x45189b96  // smmla z22.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      ".inst 0x45199bcb  // smmla z11.s, z30.b, z25.b\n"
+      ".inst 0x45199b93  // smmla z19.s, z28.b, z25.b\n"
+      ".inst 0x45189bcf  // smmla z15.s, z30.b, z24.b\n"
+      ".inst 0x45189b97  // smmla z23.s, z28.b, z24.b\n"
       "bgt 46b\n"
       "47:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "trn1 z28.d, z1.d, z24.d\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "trn1 z26.d, z3.d, z27.d\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45199b88  // smmla z8.s, z28.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189b8c  // smmla z12.s, z28.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45199b89  // smmla z9.s, z28.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x45189b8d  // smmla z13.s, z28.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z27.d\n"
+      ".inst 0x45199b8a  // smmla z10.s, z28.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x45189b8e  // smmla z14.s, z28.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
+      ".inst 0x45199b8b  // smmla z11.s, z28.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      ".inst 0x45189b8f  // smmla z15.s, z28.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
       "ble 48f\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x9]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45199828  // smmla z8.s, z1.b, z25.b\n"
+      ".inst 0x45199870  // smmla z16.s, z3.b, z25.b\n"
+      ".inst 0x4518982c  // smmla z12.s, z1.b, z24.b\n"
+      ".inst 0x45189874  // smmla z20.s, z3.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45199829  // smmla z9.s, z1.b, z25.b\n"
+      ".inst 0x45199871  // smmla z17.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x4518982d  // smmla z13.s, z1.b, z24.b\n"
+      ".inst 0x45189875  // smmla z21.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x4519982a  // smmla z10.s, z1.b, z25.b\n"
+      ".inst 0x45199872  // smmla z18.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x4518982e  // smmla z14.s, z1.b, z24.b\n"
+      ".inst 0x45189876  // smmla z22.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x4519982b  // smmla z11.s, z1.b, z25.b\n"
+      ".inst 0x45199873  // smmla z19.s, z3.b, z25.b\n"
+      ".inst 0x4518982f  // smmla z15.s, z1.b, z24.b\n"
+      ".inst 0x45189877  // smmla z23.s, z3.b, z24.b\n"
       "48:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 43b\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "uzp1 z28.d, z8.d, z12.d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
+      "ld1w { z27.s }, p2/Z, [x14]\n"
       "uzp1 z12.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n"
       "uzp1 z13.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add x24, x11, x20\n"
+      "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add x26, x11, x20\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x20\n"
+      "add x24, x25, x20\n"
       "uzp1 z15.d, z16.d, z20.d\n"
       "uzp2 z16.d, z16.d, z20.d\n"
       "addvl x14, x14, #4\n"
@@ -1144,23 +1144,23 @@
       "uzp2 z18.d, z18.d, z22.d\n"
       "uzp1 z22.d, z19.d, z23.d\n"
       "uzp2 z19.d, z19.d, z23.d\n"
-      "mov z23.d, z7.d\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z12.s, z12.s, z1.s\n"
-      "add z13.s, z13.s, z2.s\n"
-      "add z14.s, z14.s, z3.s\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z15.s, z15.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "mov z23.d, z28.d\n"
+      "add z23.s, z23.s, z27.s\n"
+      "add z12.s, z12.s, z26.s\n"
+      "add z13.s, z13.s, z25.s\n"
+      "add z14.s, z14.s, z24.s\n"
+      "add z8.s, z8.s, z27.s\n"
+      "add z9.s, z9.s, z26.s\n"
+      "add z10.s, z10.s, z25.s\n"
+      "add z11.s, z11.s, z24.s\n"
+      "add z15.s, z15.s, z27.s\n"
+      "add z20.s, z20.s, z26.s\n"
+      "add z21.s, z21.s, z25.s\n"
+      "add z22.s, z22.s, z24.s\n"
+      "add z16.s, z16.s, z27.s\n"
+      "add z17.s, z17.s, z26.s\n"
+      "add z18.s, z18.s, z25.s\n"
+      "add z19.s, z19.s, z24.s\n"
       "tbz %x[flags], #4, 49f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1174,10 +1174,10 @@
       "addvl x13, x13, #4\n"
       "b 50f\n"
       "49:"  // Height 4: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -1202,141 +1202,141 @@
       ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
       ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
       "tbz %x[flags], #5, 51f\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z12.d, z1.d\n"
-      "and z6.d, z13.d, z2.d\n"
-      "and z7.d, z14.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z12.s, z12.s, z5.s\n"
-      "sqadd z13.s, z13.s, z6.s\n"
-      "sqadd z14.s, z14.s, z7.s\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z15.d, z0.d\n"
-      "and z5.d, z20.d, z1.d\n"
-      "and z6.d, z21.d, z2.d\n"
-      "and z7.d, z22.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z15.s, z15.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z27.d, z23.d, z0.d\n"
+      "and z26.d, z12.d, z1.d\n"
+      "and z25.d, z13.d, z2.d\n"
+      "and z24.d, z14.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z27.s\n"
+      "sqadd z12.s, z12.s, z26.s\n"
+      "sqadd z13.s, z13.s, z25.s\n"
+      "sqadd z14.s, z14.s, z24.s\n"
+      "and z27.d, z8.d, z0.d\n"
+      "and z26.d, z9.d, z1.d\n"
+      "and z25.d, z10.d, z2.d\n"
+      "and z24.d, z11.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z27.s\n"
+      "sqadd z9.s, z9.s, z26.s\n"
+      "sqadd z10.s, z10.s, z25.s\n"
+      "sqadd z11.s, z11.s, z24.s\n"
+      "and z27.d, z15.d, z0.d\n"
+      "and z26.d, z20.d, z1.d\n"
+      "and z25.d, z21.d, z2.d\n"
+      "and z24.d, z22.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z27.s\n"
+      "sqadd z20.s, z20.s, z26.s\n"
+      "sqadd z21.s, z21.s, z25.s\n"
+      "sqadd z22.s, z22.s, z24.s\n"
+      "and z27.d, z16.d, z0.d\n"
+      "and z26.d, z17.d, z1.d\n"
+      "and z25.d, z18.d, z2.d\n"
+      "and z24.d, z19.d, z3.d\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z27.s\n"
+      "sqadd z17.s, z17.s, z26.s\n"
+      "sqadd z18.s, z18.s, z25.s\n"
+      "sqadd z19.s, z19.s, z24.s\n"
       "51:"  // Height 4: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z23.s, z23.s, z25.s\n"
       ".inst 0x4482882c  // srshl z12.s, p2/M, z12.s, z1.s\n"
       ".inst 0x4482884d  // srshl z13.s, p2/M, z13.s, z2.s\n"
-      "add z12.s, z12.s, z4.s\n"
-      "add z13.s, z13.s, z4.s\n"
+      "add z12.s, z12.s, z25.s\n"
+      "add z13.s, z13.s, z25.s\n"
       ".inst 0x4482886e  // srshl z14.s, p2/M, z14.s, z3.s\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z14.s, z14.s, z4.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z14.s, z14.s, z25.s\n"
+      "add z8.s, z8.s, z25.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z25.s\n"
+      "add z10.s, z10.s, z25.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x4482880f  // srshl z15.s, p2/M, z15.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z15.s, z15.s, z4.s\n"
+      "add z11.s, z11.s, z25.s\n"
+      "add z15.s, z15.s, z25.s\n"
       ".inst 0x44828834  // srshl z20.s, p2/M, z20.s, z1.s\n"
       ".inst 0x44828855  // srshl z21.s, p2/M, z21.s, z2.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z25.s\n"
+      "add z21.s, z21.s, z25.s\n"
       ".inst 0x44828876  // srshl z22.s, p2/M, z22.s, z3.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z25.s\n"
+      "add z16.s, z16.s, z25.s\n"
       ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
       ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z25.s\n"
+      "add z18.s, z18.s, z25.s\n"
       ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z25.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
+      "smin z23.s, p2/M, z23.s, z24.s\n"
+      "smin z12.s, p2/M, z12.s, z24.s\n"
+      "smin z13.s, p2/M, z13.s, z24.s\n"
+      "smin z14.s, p2/M, z14.s, z24.s\n"
+      "smin z8.s, p2/M, z8.s, z24.s\n"
+      "smin z9.s, p2/M, z9.s, z24.s\n"
+      "smin z10.s, p2/M, z10.s, z24.s\n"
+      "smin z11.s, p2/M, z11.s, z24.s\n"
+      "smin z15.s, p2/M, z15.s, z24.s\n"
+      "smin z20.s, p2/M, z20.s, z24.s\n"
+      "smin z21.s, p2/M, z21.s, z24.s\n"
+      "smin z22.s, p2/M, z22.s, z24.s\n"
+      "smin z16.s, p2/M, z16.s, z24.s\n"
+      "smin z17.s, p2/M, z17.s, z24.s\n"
+      "smin z18.s, p2/M, z18.s, z24.s\n"
+      "smin z19.s, p2/M, z19.s, z24.s\n"
+      "smax z23.s, p2/M, z23.s, z25.s\n"
+      "smax z12.s, p2/M, z12.s, z25.s\n"
+      "smax z13.s, p2/M, z13.s, z25.s\n"
       "uzp1 z23.h, z23.h, z12.h\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "uzp1 z12.h, z13.h, z14.h\n"
-      "uzp1 z23.b, z23.b, z12.b\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z14.s, p2/M, z14.s, z25.s\n"
+      "smax z8.s, p2/M, z8.s, z25.s\n"
+      "uzp1 z24.h, z13.h, z14.h\n"
+      "uzp1 z23.b, z23.b, z24.b\n"
+      "smax z9.s, p2/M, z9.s, z25.s\n"
+      "smax z10.s, p2/M, z10.s, z25.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
       "st1b { z23.b }, p1, [x11]\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z25.s\n"
+      "smax z15.s, p2/M, z15.s, z25.s\n"
+      "uzp1 z23.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z23.b\n"
+      "smax z20.s, p2/M, z20.s, z25.s\n"
+      "smax z21.s, p2/M, z21.s, z25.s\n"
       "uzp1 z15.h, z15.h, z20.h\n"
-      "st1b { z8.b }, p1, [x24]\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "st1b { z8.b }, p1, [x26]\n"
+      "smax z22.s, p2/M, z22.s, z25.s\n"
+      "smax z16.s, p2/M, z16.s, z25.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z15.b, z15.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z25.s\n"
+      "smax z18.s, p2/M, z18.s, z25.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z15.b }, p1, [x23]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "st1b { z15.b }, p1, [x25]\n"
+      "smax z19.s, p2/M, z19.s, z25.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "st1b { z16.b }, p1, [x22]\n"
+      "st1b { z16.b }, p1, [x24]\n"
       "addvl x11, x11, #1\n"
       "52:"  // Height 4: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -1382,15 +1382,15 @@
       "56:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 57f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 58f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1401,204 +1401,204 @@
       "b 58f\n"
       "57:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "58:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "ble 60f\n"
       "59:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1rqb { z6.b }, p0/Z, [x26]\n"
+      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z7.b }, p0/Z, [x24]\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn1 z5.d, z6.d, z1.d\n"
+      "trn2 z6.d, z6.d, z1.d\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "trn1 z3.d, z7.d, z2.d\n"
+      "trn2 z7.d, z7.d, z2.d\n"
+      "ld1b { z1.b }, p2/Z, [x9]\n"
+      "trn1 z2.d, z4.d, z0.d\n"
+      "trn2 z4.d, z4.d, z0.d\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x450198a8  // smmla z8.s, z5.b, z1.b\n"
+      ".inst 0x45019870  // smmla z16.s, z3.b, z1.b\n"
+      ".inst 0x45019858  // smmla z24.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
+      ".inst 0x450098ac  // smmla z12.s, z5.b, z0.b\n"
+      ".inst 0x45009874  // smmla z20.s, z3.b, z0.b\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
+      ".inst 0x4500985c  // smmla z28.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x450198a9  // smmla z9.s, z5.b, z1.b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x45019871  // smmla z17.s, z3.b, z1.b\n"
+      ".inst 0x45019859  // smmla z25.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
+      ".inst 0x450098ad  // smmla z13.s, z5.b, z0.b\n"
+      ".inst 0x45009875  // smmla z21.s, z3.b, z0.b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x4500985d  // smmla z29.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x450198aa  // smmla z10.s, z5.b, z1.b\n"
+      ".inst 0x45019872  // smmla z18.s, z3.b, z1.b\n"
+      ".inst 0x4501985a  // smmla z26.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x450098ae  // smmla z14.s, z5.b, z0.b\n"
+      ".inst 0x45009876  // smmla z22.s, z3.b, z0.b\n"
+      ".inst 0x4500985e  // smmla z30.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x450198ab  // smmla z11.s, z5.b, z1.b\n"
+      ".inst 0x45019873  // smmla z19.s, z3.b, z1.b\n"
+      ".inst 0x4501985b  // smmla z27.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      ".inst 0x450098af  // smmla z15.s, z5.b, z0.b\n"
+      ".inst 0x45009877  // smmla z23.s, z3.b, z0.b\n"
+      ".inst 0x4500985f  // smmla z31.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      ".inst 0x450198c8  // smmla z8.s, z6.b, z1.b\n"
+      ".inst 0x450198f0  // smmla z16.s, z7.b, z1.b\n"
+      ".inst 0x45019898  // smmla z24.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      ".inst 0x450098cc  // smmla z12.s, z6.b, z0.b\n"
+      ".inst 0x450098f4  // smmla z20.s, z7.b, z0.b\n"
+      ".inst 0x4500989c  // smmla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      ".inst 0x450198c9  // smmla z9.s, z6.b, z1.b\n"
+      ".inst 0x450198f1  // smmla z17.s, z7.b, z1.b\n"
+      ".inst 0x45019899  // smmla z25.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      ".inst 0x450098cd  // smmla z13.s, z6.b, z0.b\n"
+      ".inst 0x450098f5  // smmla z21.s, z7.b, z0.b\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      ".inst 0x450198ca  // smmla z10.s, z6.b, z1.b\n"
+      ".inst 0x450198f2  // smmla z18.s, z7.b, z1.b\n"
+      ".inst 0x4501989a  // smmla z26.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      ".inst 0x450098ce  // smmla z14.s, z6.b, z0.b\n"
+      ".inst 0x450098f6  // smmla z22.s, z7.b, z0.b\n"
+      ".inst 0x4500989e  // smmla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      ".inst 0x450198cb  // smmla z11.s, z6.b, z1.b\n"
+      ".inst 0x450198f3  // smmla z19.s, z7.b, z1.b\n"
+      ".inst 0x4501989b  // smmla z27.s, z4.b, z1.b\n"
+      ".inst 0x450098cf  // smmla z15.s, z6.b, z0.b\n"
+      ".inst 0x450098f7  // smmla z23.s, z7.b, z0.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "bgt 59b\n"
       "60:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
+      "ld1rqb { z4.b }, p0/Z, [x25]\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn1 z7.d, z1.d, z4.d\n"
+      "trn2 z1.d, z1.d, z4.d\n"
       "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "trn1 z6.d, z3.d, z2.d\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "ld1b { z2.b }, p2/Z, [x9]\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x450298e8  // smmla z8.s, z7.b, z2.b\n"
+      ".inst 0x450298d0  // smmla z16.s, z6.b, z2.b\n"
+      ".inst 0x45029898  // smmla z24.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
+      ".inst 0x450098ec  // smmla z12.s, z7.b, z0.b\n"
+      ".inst 0x450098d4  // smmla z20.s, z6.b, z0.b\n"
+      ".inst 0x4500989c  // smmla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x450298e9  // smmla z9.s, z7.b, z2.b\n"
+      ".inst 0x450298d1  // smmla z17.s, z6.b, z2.b\n"
+      ".inst 0x45029899  // smmla z25.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450098d5  // smmla z21.s, z6.b, z0.b\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x450298ea  // smmla z10.s, z7.b, z2.b\n"
+      ".inst 0x450298d2  // smmla z18.s, z6.b, z2.b\n"
+      ".inst 0x4502989a  // smmla z26.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x450098ee  // smmla z14.s, z7.b, z0.b\n"
+      ".inst 0x450098d6  // smmla z22.s, z6.b, z0.b\n"
+      ".inst 0x4500989e  // smmla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x450298eb  // smmla z11.s, z7.b, z2.b\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
+      ".inst 0x450298d3  // smmla z19.s, z6.b, z2.b\n"
+      ".inst 0x4502989b  // smmla z27.s, z4.b, z2.b\n"
+      ".inst 0x450098ef  // smmla z15.s, z7.b, z0.b\n"
+      ".inst 0x450098d7  // smmla z23.s, z6.b, z0.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "ble 61f\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1b { z2.b }, p2/Z, [x9]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45029828  // smmla z8.s, z1.b, z2.b\n"
+      ".inst 0x45029870  // smmla z16.s, z3.b, z2.b\n"
+      ".inst 0x450298b8  // smmla z24.s, z5.b, z2.b\n"
+      ".inst 0x4500982c  // smmla z12.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+      ".inst 0x45009874  // smmla z20.s, z3.b, z0.b\n"
+      ".inst 0x450098bc  // smmla z28.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45029829  // smmla z9.s, z1.b, z2.b\n"
+      ".inst 0x45029871  // smmla z17.s, z3.b, z2.b\n"
+      ".inst 0x450298b9  // smmla z25.s, z5.b, z2.b\n"
+      ".inst 0x4500982d  // smmla z13.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x45009875  // smmla z21.s, z3.b, z0.b\n"
+      ".inst 0x450098bd  // smmla z29.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x4502982a  // smmla z10.s, z1.b, z2.b\n"
+      ".inst 0x45029872  // smmla z18.s, z3.b, z2.b\n"
+      ".inst 0x450298ba  // smmla z26.s, z5.b, z2.b\n"
+      ".inst 0x4500982e  // smmla z14.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x45009876  // smmla z22.s, z3.b, z0.b\n"
+      ".inst 0x450098be  // smmla z30.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x4502982b  // smmla z11.s, z1.b, z2.b\n"
+      ".inst 0x45029873  // smmla z19.s, z3.b, z2.b\n"
+      ".inst 0x450298bb  // smmla z27.s, z5.b, z2.b\n"
+      ".inst 0x4500982f  // smmla z15.s, z1.b, z0.b\n"
+      ".inst 0x45009877  // smmla z23.s, z3.b, z0.b\n"
+      ".inst 0x450098bf  // smmla z31.s, z5.b, z0.b\n"
       "61:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 56b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "uzp1 z4.d, z8.d, z12.d\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "add x24, x11, x20\n"
+      "add x26, x11, x20\n"
       "uzp1 z12.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x14]\n"
+      "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
       "uzp1 z13.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x20\n"
+      "add x24, x25, x20\n"
       "uzp1 z15.d, z16.d, z20.d\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x20\n"
       "addvl x14, x14, #4\n"
       "uzp1 z20.d, z17.d, z21.d\n"
       "uzp2 z17.d, z17.d, z21.d\n"
@@ -1610,27 +1610,27 @@
       "uzp1 z25.d, z25.d, z29.d\n"
       "uzp1 z26.d, z26.d, z30.d\n"
       "uzp1 z27.d, z27.d, z31.d\n"
-      "mov z31.d, z7.d\n"
-      "add z31.s, z31.s, z0.s\n"
-      "add z12.s, z12.s, z1.s\n"
-      "add z13.s, z13.s, z2.s\n"
-      "add z14.s, z14.s, z3.s\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z15.s, z15.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
+      "mov z31.d, z4.d\n"
+      "add z31.s, z31.s, z3.s\n"
+      "add z12.s, z12.s, z2.s\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z14.s, z14.s, z0.s\n"
+      "add z8.s, z8.s, z3.s\n"
+      "add z9.s, z9.s, z2.s\n"
+      "add z10.s, z10.s, z1.s\n"
+      "add z11.s, z11.s, z0.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "add z20.s, z20.s, z2.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z0.s\n"
+      "add z16.s, z16.s, z3.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z1.s\n"
+      "add z19.s, z19.s, z0.s\n"
+      "add z24.s, z24.s, z3.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z1.s\n"
+      "add z27.s, z27.s, z0.s\n"
       "tbz %x[flags], #4, 62f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -1644,10 +1644,10 @@
       "addvl x13, x13, #4\n"
       "b 63f\n"
       "62:"  // Height 5: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -1676,173 +1676,173 @@
       ".inst 0x04a6775a  // sqrdmulh z26.s, z26.s, z6.s\n"
       ".inst 0x04a7777b  // sqrdmulh z27.s, z27.s, z7.s\n"
       "tbz %x[flags], #5, 64f\n"
-      "and z4.d, z31.d, z0.d\n"
-      "and z5.d, z12.d, z1.d\n"
-      "and z6.d, z13.d, z2.d\n"
-      "and z7.d, z14.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z31.s, z31.s, z4.s\n"
-      "sqadd z12.s, z12.s, z5.s\n"
-      "sqadd z13.s, z13.s, z6.s\n"
-      "sqadd z14.s, z14.s, z7.s\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z15.d, z0.d\n"
-      "and z5.d, z20.d, z1.d\n"
-      "and z6.d, z21.d, z2.d\n"
-      "and z7.d, z22.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z15.s, z15.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "and z4.d, z24.d, z0.d\n"
-      "and z5.d, z25.d, z1.d\n"
-      "and z6.d, z26.d, z2.d\n"
-      "and z7.d, z27.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z24.s, z24.s, z4.s\n"
-      "sqadd z25.s, z25.s, z5.s\n"
-      "sqadd z26.s, z26.s, z6.s\n"
-      "sqadd z27.s, z27.s, z7.s\n"
+      "and z30.d, z31.d, z0.d\n"
+      "and z29.d, z12.d, z1.d\n"
+      "and z28.d, z13.d, z2.d\n"
+      "and z23.d, z14.d, z3.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z31.s, z31.s, z30.s\n"
+      "sqadd z12.s, z12.s, z29.s\n"
+      "sqadd z13.s, z13.s, z28.s\n"
+      "sqadd z14.s, z14.s, z23.s\n"
+      "and z30.d, z8.d, z0.d\n"
+      "and z29.d, z9.d, z1.d\n"
+      "and z28.d, z10.d, z2.d\n"
+      "and z23.d, z11.d, z3.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z30.s\n"
+      "sqadd z9.s, z9.s, z29.s\n"
+      "sqadd z10.s, z10.s, z28.s\n"
+      "sqadd z11.s, z11.s, z23.s\n"
+      "and z30.d, z15.d, z0.d\n"
+      "and z29.d, z20.d, z1.d\n"
+      "and z28.d, z21.d, z2.d\n"
+      "and z23.d, z22.d, z3.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z30.s\n"
+      "sqadd z20.s, z20.s, z29.s\n"
+      "sqadd z21.s, z21.s, z28.s\n"
+      "sqadd z22.s, z22.s, z23.s\n"
+      "and z30.d, z16.d, z0.d\n"
+      "and z29.d, z17.d, z1.d\n"
+      "and z28.d, z18.d, z2.d\n"
+      "and z23.d, z19.d, z3.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z30.s\n"
+      "sqadd z17.s, z17.s, z29.s\n"
+      "sqadd z18.s, z18.s, z28.s\n"
+      "sqadd z19.s, z19.s, z23.s\n"
+      "and z30.d, z24.d, z0.d\n"
+      "and z29.d, z25.d, z1.d\n"
+      "and z28.d, z26.d, z2.d\n"
+      "and z23.d, z27.d, z3.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z30.s\n"
+      "sqadd z25.s, z25.s, z29.s\n"
+      "sqadd z26.s, z26.s, z28.s\n"
+      "sqadd z27.s, z27.s, z23.s\n"
       "64:"  // Height 5: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
-      "add z31.s, z31.s, z4.s\n"
+      "add z31.s, z31.s, z28.s\n"
       ".inst 0x4482882c  // srshl z12.s, p2/M, z12.s, z1.s\n"
       ".inst 0x4482884d  // srshl z13.s, p2/M, z13.s, z2.s\n"
-      "add z12.s, z12.s, z4.s\n"
-      "add z13.s, z13.s, z4.s\n"
+      "add z12.s, z12.s, z28.s\n"
+      "add z13.s, z13.s, z28.s\n"
       ".inst 0x4482886e  // srshl z14.s, p2/M, z14.s, z3.s\n"
       ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
-      "add z14.s, z14.s, z4.s\n"
-      "add z8.s, z8.s, z4.s\n"
+      "add z14.s, z14.s, z28.s\n"
+      "add z8.s, z8.s, z28.s\n"
       ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
       ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
-      "add z9.s, z9.s, z4.s\n"
-      "add z10.s, z10.s, z4.s\n"
+      "add z9.s, z9.s, z28.s\n"
+      "add z10.s, z10.s, z28.s\n"
       ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
       ".inst 0x4482880f  // srshl z15.s, p2/M, z15.s, z0.s\n"
-      "add z11.s, z11.s, z4.s\n"
-      "add z15.s, z15.s, z4.s\n"
+      "add z11.s, z11.s, z28.s\n"
+      "add z15.s, z15.s, z28.s\n"
       ".inst 0x44828834  // srshl z20.s, p2/M, z20.s, z1.s\n"
       ".inst 0x44828855  // srshl z21.s, p2/M, z21.s, z2.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z28.s\n"
+      "add z21.s, z21.s, z28.s\n"
       ".inst 0x44828876  // srshl z22.s, p2/M, z22.s, z3.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z28.s\n"
+      "add z16.s, z16.s, z28.s\n"
       ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
       ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z28.s\n"
+      "add z18.s, z18.s, z28.s\n"
       ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z19.s, z19.s, z28.s\n"
+      "add z24.s, z24.s, z28.s\n"
       ".inst 0x44828839  // srshl z25.s, p2/M, z25.s, z1.s\n"
       ".inst 0x4482885a  // srshl z26.s, p2/M, z26.s, z2.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z28.s\n"
+      "add z26.s, z26.s, z28.s\n"
       ".inst 0x4482887b  // srshl z27.s, p2/M, z27.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z28.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      "smin z31.s, p2/M, z31.s, z23.s\n"
+      "smin z12.s, p2/M, z12.s, z23.s\n"
+      "smin z13.s, p2/M, z13.s, z23.s\n"
+      "smin z14.s, p2/M, z14.s, z23.s\n"
+      "smin z8.s, p2/M, z8.s, z23.s\n"
+      "smin z9.s, p2/M, z9.s, z23.s\n"
+      "smin z10.s, p2/M, z10.s, z23.s\n"
+      "smin z11.s, p2/M, z11.s, z23.s\n"
+      "smin z15.s, p2/M, z15.s, z23.s\n"
+      "smin z20.s, p2/M, z20.s, z23.s\n"
+      "smin z21.s, p2/M, z21.s, z23.s\n"
+      "smin z22.s, p2/M, z22.s, z23.s\n"
+      "smin z16.s, p2/M, z16.s, z23.s\n"
+      "smin z17.s, p2/M, z17.s, z23.s\n"
+      "smin z18.s, p2/M, z18.s, z23.s\n"
+      "smin z19.s, p2/M, z19.s, z23.s\n"
+      "smin z24.s, p2/M, z24.s, z23.s\n"
+      "smin z25.s, p2/M, z25.s, z23.s\n"
+      "smin z26.s, p2/M, z26.s, z23.s\n"
+      "smin z27.s, p2/M, z27.s, z23.s\n"
+      "smax z31.s, p2/M, z31.s, z28.s\n"
+      "smax z12.s, p2/M, z12.s, z28.s\n"
+      "smax z13.s, p2/M, z13.s, z28.s\n"
       "uzp1 z31.h, z31.h, z12.h\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "uzp1 z12.h, z13.h, z14.h\n"
-      "uzp1 z31.b, z31.b, z12.b\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z14.s, p2/M, z14.s, z28.s\n"
+      "smax z8.s, p2/M, z8.s, z28.s\n"
+      "uzp1 z23.h, z13.h, z14.h\n"
+      "uzp1 z31.b, z31.b, z23.b\n"
+      "smax z9.s, p2/M, z9.s, z28.s\n"
+      "smax z10.s, p2/M, z10.s, z28.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
       "st1b { z31.b }, p1, [x11]\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z28.s\n"
+      "smax z15.s, p2/M, z15.s, z28.s\n"
+      "uzp1 z23.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z23.b\n"
+      "smax z20.s, p2/M, z20.s, z28.s\n"
+      "smax z21.s, p2/M, z21.s, z28.s\n"
       "uzp1 z15.h, z15.h, z20.h\n"
-      "st1b { z8.b }, p1, [x24]\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "st1b { z8.b }, p1, [x26]\n"
+      "smax z22.s, p2/M, z22.s, z28.s\n"
+      "smax z16.s, p2/M, z16.s, z28.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z15.b, z15.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z28.s\n"
+      "smax z18.s, p2/M, z18.s, z28.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z15.b }, p1, [x23]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "st1b { z15.b }, p1, [x25]\n"
+      "smax z19.s, p2/M, z19.s, z28.s\n"
+      "smax z24.s, p2/M, z24.s, z28.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z25.s, p2/M, z25.s, z28.s\n"
+      "smax z26.s, p2/M, z26.s, z28.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z16.b }, p1, [x22]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x21]\n"
+      "st1b { z16.b }, p1, [x24]\n"
+      "smax z27.s, p2/M, z27.s, z28.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
       "addvl x11, x11, #1\n"
       "65:"  // Height 5: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -1891,16 +1891,16 @@
       "69:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 70f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 71f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1912,209 +1912,209 @@
       "b 71f\n"
       "70:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "71:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "ble 73f\n"
       "72:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "ld1rqb { z6.b }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1rqb { z7.b }, p0/Z, [x26]\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
+      "trn1 z6.d, z7.d, z0.d\n"
+      "ld1rqb { z5.b }, p0/Z, [x24]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "trn2 z7.d, z7.d, z0.d\n"
+      "trn1 z4.d, z5.d, z1.d\n"
+      "ld1rqb { z3.b }, p0/Z, [x22]\n"
+      "ld1rqb { z0.b }, p0/Z, [x21]\n"
+      "trn2 z5.d, z5.d, z1.d\n"
+      "trn1 z2.d, z3.d, z0.d\n"
+      "trn2 z3.d, z3.d, z0.d\n"
+      "ld1b { z1.b }, p2/Z, [x9]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x450198c8  // smmla z8.s, z6.b, z1.b\n"
+      ".inst 0x45019890  // smmla z16.s, z4.b, z1.b\n"
+      ".inst 0x45019858  // smmla z24.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
+      ".inst 0x450098cc  // smmla z12.s, z6.b, z0.b\n"
+      ".inst 0x45009894  // smmla z20.s, z4.b, z0.b\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
+      ".inst 0x4500985c  // smmla z28.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x450198c9  // smmla z9.s, z6.b, z1.b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x45019891  // smmla z17.s, z4.b, z1.b\n"
+      ".inst 0x45019859  // smmla z25.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
+      ".inst 0x450098cd  // smmla z13.s, z6.b, z0.b\n"
+      ".inst 0x45009895  // smmla z21.s, z4.b, z0.b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
+      ".inst 0x4500985d  // smmla z29.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x450198ca  // smmla z10.s, z6.b, z1.b\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x45019892  // smmla z18.s, z4.b, z1.b\n"
+      ".inst 0x4501985a  // smmla z26.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x450098ce  // smmla z14.s, z6.b, z0.b\n"
+      ".inst 0x45009896  // smmla z22.s, z4.b, z0.b\n"
+      ".inst 0x4500985e  // smmla z30.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x450198cb  // smmla z11.s, z6.b, z1.b\n"
+      ".inst 0x45019893  // smmla z19.s, z4.b, z1.b\n"
+      ".inst 0x4501985b  // smmla z27.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n"
+      ".inst 0x450098cf  // smmla z15.s, z6.b, z0.b\n"
+      ".inst 0x45009897  // smmla z23.s, z4.b, z0.b\n"
+      ".inst 0x4500985f  // smmla z31.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n"
+      ".inst 0x450198e8  // smmla z8.s, z7.b, z1.b\n"
+      ".inst 0x450198b0  // smmla z16.s, z5.b, z1.b\n"
+      ".inst 0x45019878  // smmla z24.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n"
+      ".inst 0x450098ec  // smmla z12.s, z7.b, z0.b\n"
+      ".inst 0x450098b4  // smmla z20.s, z5.b, z0.b\n"
+      ".inst 0x4500987c  // smmla z28.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n"
+      ".inst 0x450198e9  // smmla z9.s, z7.b, z1.b\n"
+      ".inst 0x450198b1  // smmla z17.s, z5.b, z1.b\n"
+      ".inst 0x45019879  // smmla z25.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450098b5  // smmla z21.s, z5.b, z0.b\n"
+      ".inst 0x4500987d  // smmla z29.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n"
+      ".inst 0x450198ea  // smmla z10.s, z7.b, z1.b\n"
+      ".inst 0x450198b2  // smmla z18.s, z5.b, z1.b\n"
+      ".inst 0x4501987a  // smmla z26.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n"
+      ".inst 0x450098ee  // smmla z14.s, z7.b, z0.b\n"
+      ".inst 0x450098b6  // smmla z22.s, z5.b, z0.b\n"
+      ".inst 0x4500987e  // smmla z30.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n"
+      ".inst 0x450198eb  // smmla z11.s, z7.b, z1.b\n"
+      ".inst 0x450198b3  // smmla z19.s, z5.b, z1.b\n"
+      ".inst 0x4501987b  // smmla z27.s, z3.b, z1.b\n"
+      ".inst 0x450098ef  // smmla z15.s, z7.b, z0.b\n"
+      ".inst 0x450098b7  // smmla z23.s, z5.b, z0.b\n"
+      ".inst 0x4500987f  // smmla z31.s, z3.b, z0.b\n"
       "bgt 72b\n"
       "73:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
+      "trn1 z7.d, z1.d, z0.d\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z0.d\n"
+      "trn1 z6.d, z3.d, z2.d\n"
       "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "ld1rqb { z6.b }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x21]\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1b { z2.b }, p2/Z, [x9]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x450298e8  // smmla z8.s, z7.b, z2.b\n"
+      ".inst 0x450298d0  // smmla z16.s, z6.b, z2.b\n"
+      ".inst 0x45029898  // smmla z24.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
+      ".inst 0x450098ec  // smmla z12.s, z7.b, z0.b\n"
+      ".inst 0x450098d4  // smmla z20.s, z6.b, z0.b\n"
+      ".inst 0x4500989c  // smmla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x450298e9  // smmla z9.s, z7.b, z2.b\n"
+      ".inst 0x450298d1  // smmla z17.s, z6.b, z2.b\n"
+      ".inst 0x45029899  // smmla z25.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450098d5  // smmla z21.s, z6.b, z0.b\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x450298ea  // smmla z10.s, z7.b, z2.b\n"
+      ".inst 0x450298d2  // smmla z18.s, z6.b, z2.b\n"
+      ".inst 0x4502989a  // smmla z26.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x450098ee  // smmla z14.s, z7.b, z0.b\n"
+      ".inst 0x450098d6  // smmla z22.s, z6.b, z0.b\n"
+      ".inst 0x4500989e  // smmla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
+      ".inst 0x450298eb  // smmla z11.s, z7.b, z2.b\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
+      ".inst 0x450298d3  // smmla z19.s, z6.b, z2.b\n"
+      ".inst 0x4502989b  // smmla z27.s, z4.b, z2.b\n"
+      ".inst 0x450098ef  // smmla z15.s, z7.b, z0.b\n"
+      ".inst 0x450098d7  // smmla z23.s, z6.b, z0.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "ble 74f\n"
-      "ld1b { z7.b }, p2/Z, [x9]\n"
-      "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n"
+      "ld1b { z2.b }, p2/Z, [x9]\n"
+      "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n"
+      ".inst 0x45029828  // smmla z8.s, z1.b, z2.b\n"
+      ".inst 0x45029870  // smmla z16.s, z3.b, z2.b\n"
+      ".inst 0x450298b8  // smmla z24.s, z5.b, z2.b\n"
+      ".inst 0x4500982c  // smmla z12.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n"
+      ".inst 0x45009874  // smmla z20.s, z3.b, z0.b\n"
+      ".inst 0x450098bc  // smmla z28.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n"
+      ".inst 0x45029829  // smmla z9.s, z1.b, z2.b\n"
+      ".inst 0x45029871  // smmla z17.s, z3.b, z2.b\n"
+      ".inst 0x450298b9  // smmla z25.s, z5.b, z2.b\n"
+      ".inst 0x4500982d  // smmla z13.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n"
+      ".inst 0x45009875  // smmla z21.s, z3.b, z0.b\n"
+      ".inst 0x450098bd  // smmla z29.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n"
+      ".inst 0x4502982a  // smmla z10.s, z1.b, z2.b\n"
+      ".inst 0x45029872  // smmla z18.s, z3.b, z2.b\n"
+      ".inst 0x450298ba  // smmla z26.s, z5.b, z2.b\n"
+      ".inst 0x4500982e  // smmla z14.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n"
+      ".inst 0x45009876  // smmla z22.s, z3.b, z0.b\n"
+      ".inst 0x450098be  // smmla z30.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n"
       "addvl x9, x9, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x4502982b  // smmla z11.s, z1.b, z2.b\n"
+      ".inst 0x45029873  // smmla z19.s, z3.b, z2.b\n"
+      ".inst 0x450298bb  // smmla z27.s, z5.b, z2.b\n"
+      ".inst 0x4500982f  // smmla z15.s, z1.b, z0.b\n"
+      ".inst 0x45009877  // smmla z23.s, z3.b, z0.b\n"
+      ".inst 0x450098bf  // smmla z31.s, z5.b, z0.b\n"
       "74:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 69b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
-      "add x24, x11, x20\n"
+      "uzp1 z4.d, z8.d, z12.d\n"
+      "add x26, x11, x20\n"
       "uzp2 z8.d, z8.d, z12.d\n"
       "uzp1 z12.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "add x23, x24, x20\n"
-      "ld1w { z0.s }, p2/Z, [x14]\n"
+      "add x25, x26, x20\n"
+      "ld1w { z3.s }, p2/Z, [x14]\n"
       "uzp1 z13.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n"
-      "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n"
-      "add x22, x23, x20\n"
+      "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n"
+      "add x24, x25, x20\n"
       "uzp1 z15.d, z16.d, z20.d\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "add x21, x22, x20\n"
-      "add x20, x21, x20\n"
+      "add x23, x24, x20\n"
+      "add x22, x23, x20\n"
       "uzp1 z20.d, z17.d, z21.d\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "addvl x14, x14, #4\n"
@@ -2130,31 +2130,31 @@
       "uzp2 z26.d, z26.d, z30.d\n"
       "uzp1 z30.d, z27.d, z31.d\n"
       "uzp2 z27.d, z27.d, z31.d\n"
-      "mov z31.d, z7.d\n"
-      "add z31.s, z31.s, z0.s\n"
-      "add z12.s, z12.s, z1.s\n"
-      "add z13.s, z13.s, z2.s\n"
-      "add z14.s, z14.s, z3.s\n"
-      "add z8.s, z8.s, z0.s\n"
-      "add z9.s, z9.s, z1.s\n"
-      "add z10.s, z10.s, z2.s\n"
-      "add z11.s, z11.s, z3.s\n"
-      "add z15.s, z15.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z28.s, z28.s, z1.s\n"
-      "add z29.s, z29.s, z2.s\n"
-      "add z30.s, z30.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
+      "mov z31.d, z4.d\n"
+      "add z31.s, z31.s, z3.s\n"
+      "add z12.s, z12.s, z2.s\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z14.s, z14.s, z0.s\n"
+      "add z8.s, z8.s, z3.s\n"
+      "add z9.s, z9.s, z2.s\n"
+      "add z10.s, z10.s, z1.s\n"
+      "add z11.s, z11.s, z0.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "add z20.s, z20.s, z2.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z0.s\n"
+      "add z16.s, z16.s, z3.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z1.s\n"
+      "add z19.s, z19.s, z0.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z28.s, z28.s, z2.s\n"
+      "add z29.s, z29.s, z1.s\n"
+      "add z30.s, z30.s, z0.s\n"
+      "add z24.s, z24.s, z3.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z1.s\n"
+      "add z27.s, z27.s, z0.s\n"
       "tbz %x[flags], #4, 75f\n"
       "ld1w { z0.s }, p2/Z, [x12]\n"
       "ld1w { z4.s }, p2/Z, [x13]\n"
@@ -2168,10 +2168,10 @@
       "addvl x13, x13, #4\n"
       "b 76f\n"
       "75:"  // Height 6: per layer parameters
-      "add x26, %x[qp], %[per_layer_right_shift]\n"
-      "add x25, %x[qp], %[per_layer_mul]\n"
-      "ld1rw { z0.s }, p2/Z, [x26]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x21, %x[qp], %[per_layer_right_shift]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z0.s }, p2/Z, [x21]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       "mov z1.d, z0.d\n"
       "mov z5.d, z4.d\n"
       "mov z2.d, z0.d\n"
@@ -2204,81 +2204,81 @@
       ".inst 0x04a6775a  // sqrdmulh z26.s, z26.s, z6.s\n"
       ".inst 0x04a7777b  // sqrdmulh z27.s, z27.s, z7.s\n"
       "tbz %x[flags], #5, 77f\n"
-      "and z4.d, z31.d, z0.d\n"
-      "and z5.d, z12.d, z1.d\n"
-      "and z6.d, z13.d, z2.d\n"
-      "and z7.d, z14.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "and z7.d, z31.d, z0.d\n"
+      "and z6.d, z12.d, z1.d\n"
+      "and z5.d, z13.d, z2.d\n"
+      "and z4.d, z14.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z31.s, z31.s, z4.s\n"
-      "sqadd z12.s, z12.s, z5.s\n"
-      "sqadd z13.s, z13.s, z6.s\n"
-      "sqadd z14.s, z14.s, z7.s\n"
-      "and z4.d, z8.d, z0.d\n"
-      "and z5.d, z9.d, z1.d\n"
-      "and z6.d, z10.d, z2.d\n"
-      "and z7.d, z11.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
       "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z8.s, z8.s, z4.s\n"
-      "sqadd z9.s, z9.s, z5.s\n"
-      "sqadd z10.s, z10.s, z6.s\n"
-      "sqadd z11.s, z11.s, z7.s\n"
-      "and z4.d, z15.d, z0.d\n"
-      "and z5.d, z20.d, z1.d\n"
-      "and z6.d, z21.d, z2.d\n"
-      "and z7.d, z22.d, z3.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
       "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z15.s, z15.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z1.d\n"
-      "and z6.d, z18.d, z2.d\n"
-      "and z7.d, z19.d, z3.d\n"
       "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z31.s, z31.s, z7.s\n"
+      "sqadd z12.s, z12.s, z6.s\n"
+      "sqadd z13.s, z13.s, z5.s\n"
+      "sqadd z14.s, z14.s, z4.s\n"
+      "and z7.d, z8.d, z0.d\n"
+      "and z6.d, z9.d, z1.d\n"
+      "and z5.d, z10.d, z2.d\n"
+      "and z4.d, z11.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z28.d, z1.d\n"
-      "and z6.d, z29.d, z2.d\n"
-      "and z7.d, z30.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
       "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z7.s\n"
+      "sqadd z9.s, z9.s, z6.s\n"
+      "sqadd z10.s, z10.s, z5.s\n"
+      "sqadd z11.s, z11.s, z4.s\n"
+      "and z7.d, z15.d, z0.d\n"
+      "and z6.d, z20.d, z1.d\n"
+      "and z5.d, z21.d, z2.d\n"
+      "and z4.d, z22.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z28.s, z28.s, z5.s\n"
-      "sqadd z29.s, z29.s, z6.s\n"
-      "sqadd z30.s, z30.s, z7.s\n"
-      "and z4.d, z24.d, z0.d\n"
-      "and z5.d, z25.d, z1.d\n"
-      "and z6.d, z26.d, z2.d\n"
-      "and z7.d, z27.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
       "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z7.s\n"
+      "sqadd z20.s, z20.s, z6.s\n"
+      "sqadd z21.s, z21.s, z5.s\n"
+      "sqadd z22.s, z22.s, z4.s\n"
+      "and z7.d, z16.d, z0.d\n"
+      "and z6.d, z17.d, z1.d\n"
+      "and z5.d, z18.d, z2.d\n"
+      "and z4.d, z19.d, z3.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z24.s, z24.s, z4.s\n"
-      "sqadd z25.s, z25.s, z5.s\n"
-      "sqadd z26.s, z26.s, z6.s\n"
-      "sqadd z27.s, z27.s, z7.s\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z7.s\n"
+      "sqadd z17.s, z17.s, z6.s\n"
+      "sqadd z18.s, z18.s, z5.s\n"
+      "sqadd z19.s, z19.s, z4.s\n"
+      "and z7.d, z23.d, z0.d\n"
+      "and z6.d, z28.d, z1.d\n"
+      "and z5.d, z29.d, z2.d\n"
+      "and z4.d, z30.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z7.s\n"
+      "sqadd z28.s, z28.s, z6.s\n"
+      "sqadd z29.s, z29.s, z5.s\n"
+      "sqadd z30.s, z30.s, z4.s\n"
+      "and z7.d, z24.d, z0.d\n"
+      "and z6.d, z25.d, z1.d\n"
+      "and z5.d, z26.d, z2.d\n"
+      "and z4.d, z27.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z7.s\n"
+      "sqadd z25.s, z25.s, z6.s\n"
+      "sqadd z26.s, z26.s, z5.s\n"
+      "sqadd z27.s, z27.s, z4.s\n"
       "77:"  // Height 6: no shift correction
-      "add x25, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x20]\n"
       ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
       "add z31.s, z31.s, z4.s\n"
       ".inst 0x4482882c  // srshl z12.s, p2/M, z12.s, z1.s\n"
@@ -2326,83 +2326,83 @@
       "add z25.s, z25.s, z4.s\n"
       "add z26.s, z26.s, z4.s\n"
       ".inst 0x4482887b  // srshl z27.s, p2/M, z27.s, z3.s\n"
-      "add x25, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x25]\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
       "add z27.s, z27.s, z4.s\n"
-      "add x25, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x25]\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smin z12.s, p2/M, z12.s, z6.s\n"
-      "smin z13.s, p2/M, z13.s, z6.s\n"
-      "smin z14.s, p2/M, z14.s, z6.s\n"
-      "smin z8.s, p2/M, z8.s, z6.s\n"
-      "smin z9.s, p2/M, z9.s, z6.s\n"
-      "smin z10.s, p2/M, z10.s, z6.s\n"
-      "smin z11.s, p2/M, z11.s, z6.s\n"
-      "smin z15.s, p2/M, z15.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z28.s, p2/M, z28.s, z6.s\n"
-      "smin z29.s, p2/M, z29.s, z6.s\n"
-      "smin z30.s, p2/M, z30.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "smax z12.s, p2/M, z12.s, z5.s\n"
-      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "smin z31.s, p2/M, z31.s, z0.s\n"
+      "smin z12.s, p2/M, z12.s, z0.s\n"
+      "smin z13.s, p2/M, z13.s, z0.s\n"
+      "smin z14.s, p2/M, z14.s, z0.s\n"
+      "smin z8.s, p2/M, z8.s, z0.s\n"
+      "smin z9.s, p2/M, z9.s, z0.s\n"
+      "smin z10.s, p2/M, z10.s, z0.s\n"
+      "smin z11.s, p2/M, z11.s, z0.s\n"
+      "smin z15.s, p2/M, z15.s, z0.s\n"
+      "smin z20.s, p2/M, z20.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z0.s\n"
+      "smin z22.s, p2/M, z22.s, z0.s\n"
+      "smin z16.s, p2/M, z16.s, z0.s\n"
+      "smin z17.s, p2/M, z17.s, z0.s\n"
+      "smin z18.s, p2/M, z18.s, z0.s\n"
+      "smin z19.s, p2/M, z19.s, z0.s\n"
+      "smin z23.s, p2/M, z23.s, z0.s\n"
+      "smin z28.s, p2/M, z28.s, z0.s\n"
+      "smin z29.s, p2/M, z29.s, z0.s\n"
+      "smin z30.s, p2/M, z30.s, z0.s\n"
+      "smin z24.s, p2/M, z24.s, z0.s\n"
+      "smin z25.s, p2/M, z25.s, z0.s\n"
+      "smin z26.s, p2/M, z26.s, z0.s\n"
+      "smin z27.s, p2/M, z27.s, z0.s\n"
+      "smax z31.s, p2/M, z31.s, z1.s\n"
+      "smax z12.s, p2/M, z12.s, z1.s\n"
+      "smax z13.s, p2/M, z13.s, z1.s\n"
       "uzp1 z31.h, z31.h, z12.h\n"
-      "smax z14.s, p2/M, z14.s, z5.s\n"
-      "smax z8.s, p2/M, z8.s, z5.s\n"
-      "uzp1 z12.h, z13.h, z14.h\n"
-      "uzp1 z31.b, z31.b, z12.b\n"
-      "smax z9.s, p2/M, z9.s, z5.s\n"
-      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z14.s, p2/M, z14.s, z1.s\n"
+      "smax z8.s, p2/M, z8.s, z1.s\n"
+      "uzp1 z0.h, z13.h, z14.h\n"
+      "uzp1 z31.b, z31.b, z0.b\n"
+      "smax z9.s, p2/M, z9.s, z1.s\n"
+      "smax z10.s, p2/M, z10.s, z1.s\n"
       "uzp1 z8.h, z8.h, z9.h\n"
       "st1b { z31.b }, p1, [x11]\n"
-      "smax z11.s, p2/M, z11.s, z5.s\n"
-      "smax z15.s, p2/M, z15.s, z5.s\n"
-      "uzp1 z9.h, z10.h, z11.h\n"
-      "uzp1 z8.b, z8.b, z9.b\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z1.s\n"
+      "smax z15.s, p2/M, z15.s, z1.s\n"
+      "uzp1 z31.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z31.b\n"
+      "smax z20.s, p2/M, z20.s, z1.s\n"
+      "smax z21.s, p2/M, z21.s, z1.s\n"
       "uzp1 z15.h, z15.h, z20.h\n"
-      "st1b { z8.b }, p1, [x24]\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "st1b { z8.b }, p1, [x26]\n"
+      "smax z22.s, p2/M, z22.s, z1.s\n"
+      "smax z16.s, p2/M, z16.s, z1.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z15.b, z15.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z1.s\n"
+      "smax z18.s, p2/M, z18.s, z1.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "st1b { z15.b }, p1, [x23]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "st1b { z15.b }, p1, [x25]\n"
+      "smax z19.s, p2/M, z19.s, z1.s\n"
+      "smax z23.s, p2/M, z23.s, z1.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z28.s, p2/M, z28.s, z5.s\n"
-      "smax z29.s, p2/M, z29.s, z5.s\n"
+      "smax z28.s, p2/M, z28.s, z1.s\n"
+      "smax z29.s, p2/M, z29.s, z1.s\n"
       "uzp1 z23.h, z23.h, z28.h\n"
-      "st1b { z16.b }, p1, [x22]\n"
-      "smax z30.s, p2/M, z30.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z28.h, z29.h, z30.h\n"
-      "uzp1 z23.b, z23.b, z28.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "st1b { z16.b }, p1, [x24]\n"
+      "smax z30.s, p2/M, z30.s, z1.s\n"
+      "smax z24.s, p2/M, z24.s, z1.s\n"
+      "uzp1 z16.h, z29.h, z30.h\n"
+      "uzp1 z23.b, z23.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z1.s\n"
+      "smax z26.s, p2/M, z26.s, z1.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z23.b }, p1, [x21]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x20]\n"
+      "st1b { z23.b }, p1, [x23]\n"
+      "smax z27.s, p2/M, z27.s, z1.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x22]\n"
       "addvl x11, x11, #1\n"
       "78:"  // Height 6: Writeback done
       "decw x10, ALL, MUL #4\n"
@@ -2420,7 +2420,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "80:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -2428,4 +2427,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index 28057aa..cfa349f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -39,6 +39,7 @@
 {
 // Actual kernel implementations
 void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+void sve_hybrid_s8s32_dot_6x4VL_a64fx( ARGLIST );
 
 class cls_sve_hybrid_s8s32_dot_6x4VL
 {
@@ -74,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, int32_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -83,10 +83,11 @@
                     return { 20.92 };
                 case CPUModel::V1:
                     return { 62.24 };
+                case CPUModel::A64FX:
+                    return { 94.32 };
             }
         }
 
-
         if (std::is_same<T, int8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -95,6 +96,8 @@
                     return { 22.77, 3.90, 0.47 };
                 case CPUModel::V1:
                     return { 48.09, 16.24, 0.83 };
+                case CPUModel::A64FX:
+                    return { 100.19, 3.13, 0.43 };
             }
         }
 
@@ -103,13 +106,19 @@
 
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
-    cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *)
+    cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A64FX:
+                kernel=sve_hybrid_s8s32_dot_6x4VL_a64fx;
+                break;
+        }
     }
 };
 
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
index 51e9aa1..1a48321 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
@@ -115,11 +115,11 @@
       "5:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 6f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 7f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -135,12 +135,12 @@
       "8:"  // Height 1: Multiply loop: Main loop
       "sdot z8.s, z6.b, z0.b\n"
       "sdot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x26, x26, #0x4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
+      "sdot z10.s, z17.b, z0.b\n"
+      "sdot z11.s, z16.b, z0.b\n"
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
@@ -150,12 +150,12 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "sdot z8.s, z6.b, z0.b\n"
       "sdot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
+      "sdot z10.s, z17.b, z0.b\n"
+      "sdot z11.s, z16.b, z0.b\n"
       "addvl x10, x10, #4\n"
       "bne 5b\n"
       "st1w { z8.s }, p3, [x9]\n"
@@ -183,15 +183,15 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 13f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x20]\n"
+      "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 14f\n"
       "13:"  // Height 2: no accumulate
       "mov z8.s, #0x0\n"
@@ -207,12 +207,12 @@
       "15:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -220,7 +220,7 @@
       "b 17f\n"
       "16:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "17:"  // Height 2: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -231,18 +231,18 @@
       "18:"  // Height 2: Multiply loop: Main loop
       "sdot z8.s, z6.b, z0.b\n"
       "sdot z12.s, z6.b, z1.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x26, x26, #0x4\n"
       "sdot z9.s, z7.b, z0.b\n"
       "sdot z13.s, z7.b, z1.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "subs x27, x27, #0x4\n"
       "add x25, x25, #0x4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
-      "sdot z15.s, z7.b, z1.b\n"
+      "sdot z10.s, z17.b, z0.b\n"
+      "sdot z14.s, z17.b, z1.b\n"
+      "sdot z11.s, z16.b, z0.b\n"
+      "sdot z15.s, z16.b, z1.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
@@ -252,29 +252,29 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "sdot z8.s, z6.b, z0.b\n"
       "sdot z12.s, z6.b, z1.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
       "sdot z9.s, z7.b, z0.b\n"
       "sdot z13.s, z7.b, z1.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
+      "sdot z10.s, z17.b, z0.b\n"
+      "sdot z14.s, z17.b, z1.b\n"
       "addvl x10, x10, #4\n"
-      "sdot z11.s, z7.b, z0.b\n"
-      "sdot z15.s, z7.b, z1.b\n"
+      "sdot z11.s, z16.b, z0.b\n"
+      "sdot z15.s, z16.b, z1.b\n"
       "bne 15b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x20]\n"
+      "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
       "20:"  // Height 2: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -295,20 +295,20 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x23]\n"
-      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x21]\n"
+      "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x20]\n"
+      "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 24f\n"
       "23:"  // Height 3: no accumulate
       "mov z8.s, #0x0\n"
@@ -328,13 +328,13 @@
       "25:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 26f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 27f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -343,8 +343,8 @@
       "b 27f\n"
       "26:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "27:"  // Height 3: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -360,21 +360,21 @@
       "subs x27, x27, #0x4\n"
       "sdot z16.s, z6.b, z2.b\n"
       "sdot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x4\n"
       "sdot z13.s, z7.b, z1.b\n"
       "sdot z17.s, z7.b, z2.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x24, x24, #0x4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z18.s, z6.b, z2.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
+      "sdot z10.s, z21.b, z0.b\n"
+      "sdot z14.s, z21.b, z1.b\n"
+      "sdot z18.s, z21.b, z2.b\n"
+      "sdot z11.s, z20.b, z0.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
-      "sdot z15.s, z7.b, z1.b\n"
-      "sdot z19.s, z7.b, z2.b\n"
+      "sdot z15.s, z20.b, z1.b\n"
+      "sdot z19.s, z20.b, z2.b\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -386,35 +386,35 @@
       "add x28, x28, #0x1\n"
       "sdot z16.s, z6.b, z2.b\n"
       "sdot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "sdot z13.s, z7.b, z1.b\n"
       "sdot z17.s, z7.b, z2.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z18.s, z6.b, z2.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
-      "sdot z15.s, z7.b, z1.b\n"
-      "sdot z19.s, z7.b, z2.b\n"
+      "sdot z10.s, z21.b, z0.b\n"
+      "sdot z14.s, z21.b, z1.b\n"
+      "sdot z18.s, z21.b, z2.b\n"
+      "sdot z11.s, z20.b, z0.b\n"
+      "sdot z15.s, z20.b, z1.b\n"
+      "sdot z19.s, z20.b, z2.b\n"
       "bne 25b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p3, [x23]\n"
-      "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x21]\n"
+      "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p3, [x20]\n"
+      "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
       "30:"  // Height 3: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -435,25 +435,25 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x23]\n"
-      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x22]\n"
-      "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x22]\n"
+      "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x21]\n"
+      "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x20]\n"
+      "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 34f\n"
       "33:"  // Height 4: no accumulate
       "mov z8.s, #0x0\n"
@@ -477,14 +477,14 @@
       "35:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 36f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 37f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -494,9 +494,9 @@
       "b 37f\n"
       "36:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "37:"  // Height 4: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -513,7 +513,7 @@
       "subs x27, x27, #0x4\n"
       "sdot z16.s, z6.b, z2.b\n"
       "sdot z20.s, z6.b, z3.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x4\n"
       "sdot z9.s, z7.b, z0.b\n"
       "sdot z13.s, z7.b, z1.b\n"
@@ -521,19 +521,19 @@
       "add x23, x23, #0x4\n"
       "sdot z17.s, z7.b, z2.b\n"
       "sdot z21.s, z7.b, z3.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z18.s, z6.b, z2.b\n"
-      "sdot z22.s, z6.b, z3.b\n"
+      "sdot z10.s, z25.b, z0.b\n"
+      "sdot z14.s, z25.b, z1.b\n"
+      "sdot z18.s, z25.b, z2.b\n"
+      "sdot z22.s, z25.b, z3.b\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
-      "sdot z11.s, z7.b, z0.b\n"
-      "sdot z15.s, z7.b, z1.b\n"
+      "sdot z11.s, z24.b, z0.b\n"
+      "sdot z15.s, z24.b, z1.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
-      "sdot z19.s, z7.b, z2.b\n"
-      "sdot z23.s, z7.b, z3.b\n"
+      "sdot z19.s, z24.b, z2.b\n"
+      "sdot z23.s, z24.b, z3.b\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -545,44 +545,44 @@
       "add x28, x28, #0x1\n"
       "sdot z16.s, z6.b, z2.b\n"
       "sdot z20.s, z6.b, z3.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "sdot z9.s, z7.b, z0.b\n"
       "sdot z13.s, z7.b, z1.b\n"
       "sdot z17.s, z7.b, z2.b\n"
       "sdot z21.s, z7.b, z3.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z18.s, z6.b, z2.b\n"
-      "sdot z22.s, z6.b, z3.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
-      "sdot z15.s, z7.b, z1.b\n"
-      "sdot z19.s, z7.b, z2.b\n"
-      "sdot z23.s, z7.b, z3.b\n"
+      "sdot z10.s, z25.b, z0.b\n"
+      "sdot z14.s, z25.b, z1.b\n"
+      "sdot z18.s, z25.b, z2.b\n"
+      "sdot z22.s, z25.b, z3.b\n"
+      "sdot z11.s, z24.b, z0.b\n"
+      "sdot z15.s, z24.b, z1.b\n"
+      "sdot z19.s, z24.b, z2.b\n"
+      "sdot z23.s, z24.b, z3.b\n"
       "bne 35b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "st1w { z8.s }, p3, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p3, [x23]\n"
-      "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p3, [x22]\n"
-      "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x22]\n"
+      "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
+      "st1w { z16.s }, p3, [x21]\n"
+      "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
+      "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
+      "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z20.s }, p3, [x20]\n"
+      "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
       "40:"  // Height 4: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -603,30 +603,30 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z8.s }, p3/Z, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p3/Z, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x23]\n"
-      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x22]\n"
-      "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z24.s }, p3/Z, [x21]\n"
-      "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x23]\n"
+      "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x22]\n"
+      "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x21]\n"
+      "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p3/Z, [x20]\n"
+      "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 44f\n"
       "43:"  // Height 5: no accumulate
       "mov z8.s, #0x0\n"
@@ -654,15 +654,15 @@
       "45:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -673,10 +673,10 @@
       "b 47f\n"
       "46:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "47:"  // Height 5: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -698,29 +698,29 @@
       "add x24, x24, #0x4\n"
       "sdot z24.s, z6.b, z4.b\n"
       "sdot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x23, x23, #0x4\n"
       "sdot z13.s, z7.b, z1.b\n"
       "sdot z17.s, z7.b, z2.b\n"
       "add x22, x22, #0x4\n"
       "sdot z21.s, z7.b, z3.b\n"
       "sdot z25.s, z7.b, z4.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z18.s, z6.b, z2.b\n"
-      "sdot z22.s, z6.b, z3.b\n"
-      "sdot z26.s, z6.b, z4.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
+      "sdot z10.s, z29.b, z0.b\n"
+      "sdot z14.s, z29.b, z1.b\n"
+      "sdot z18.s, z29.b, z2.b\n"
+      "sdot z22.s, z29.b, z3.b\n"
+      "sdot z26.s, z29.b, z4.b\n"
+      "sdot z11.s, z28.b, z0.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
-      "sdot z15.s, z7.b, z1.b\n"
-      "sdot z19.s, z7.b, z2.b\n"
+      "sdot z15.s, z28.b, z1.b\n"
+      "sdot z19.s, z28.b, z2.b\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
-      "sdot z23.s, z7.b, z3.b\n"
-      "sdot z27.s, z7.b, z4.b\n"
+      "sdot z23.s, z28.b, z3.b\n"
+      "sdot z27.s, z28.b, z4.b\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1rw { z4.s }, p4/Z, [x22]\n"
       "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -735,50 +735,50 @@
       "cmp x28, x20\n"
       "sdot z24.s, z6.b, z4.b\n"
       "sdot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
       "sdot z13.s, z7.b, z1.b\n"
       "sdot z17.s, z7.b, z2.b\n"
       "sdot z21.s, z7.b, z3.b\n"
       "sdot z25.s, z7.b, z4.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b\n"
-      "sdot z14.s, z6.b, z1.b\n"
-      "sdot z18.s, z6.b, z2.b\n"
-      "sdot z22.s, z6.b, z3.b\n"
-      "sdot z26.s, z6.b, z4.b\n"
-      "sdot z11.s, z7.b, z0.b\n"
-      "sdot z15.s, z7.b, z1.b\n"
-      "sdot z19.s, z7.b, z2.b\n"
-      "sdot z23.s, z7.b, z3.b\n"
-      "sdot z27.s, z7.b, z4.b\n"
+      "sdot z10.s, z29.b, z0.b\n"
+      "sdot z14.s, z29.b, z1.b\n"
+      "sdot z18.s, z29.b, z2.b\n"
+      "sdot z22.s, z29.b, z3.b\n"
+      "sdot z26.s, z29.b, z4.b\n"
+      "sdot z11.s, z28.b, z0.b\n"
+      "sdot z15.s, z28.b, z1.b\n"
+      "sdot z19.s, z28.b, z2.b\n"
+      "sdot z23.s, z28.b, z3.b\n"
+      "sdot z27.s, z28.b, z4.b\n"
       "bne 45b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "st1w { z8.s }, p3, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "st1w { z8.s }, p3, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p3, [x23]\n"
-      "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p3, [x22]\n"
-      "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
-      "st1w { z24.s }, p3, [x21]\n"
-      "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
-      "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
-      "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x23]\n"
+      "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+      "st1w { z16.s }, p3, [x22]\n"
+      "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+      "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+      "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+      "st1w { z20.s }, p3, [x21]\n"
+      "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+      "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+      "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z24.s }, p3, [x20]\n"
+      "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
       "50:"  // Height 5: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -862,16 +862,16 @@
       "55:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 56f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 57f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -883,11 +883,11 @@
       "b 57f\n"
       "56:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "57:"  // Height 6: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1022,7 +1022,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "62:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1030,4 +1029,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
index b3d2e6b..eeef192 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -115,11 +115,11 @@
       "5:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 6f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 7f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -132,87 +132,87 @@
       "8:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10]\n"
+      "sdot z8.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z9.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z10.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z11.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "sdot z8.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "sdot z9.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "sdot z10.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "sdot z11.s, z16.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[2]\n"
+      "sdot z9.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[2]\n"
+      "sdot z11.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[3]\n"
+      "sdot z9.s, z16.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z10.s, z17.b, z0.b[3]\n"
+      "sdot z11.s, z16.b, z0.b[3]\n"
       "add x26, x26, #0x10\n"
       "bgt 8b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10]\n"
+      "sdot z8.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z9.s, z16.b, z0.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[0]\n"
+      "sdot z11.s, z16.b, z0.b[0]\n"
       "addvl x10, x10, #4\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[1]\n"
+      "sdot z9.s, z16.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z10.s, z17.b, z0.b[1]\n"
+      "sdot z11.s, z16.b, z0.b[1]\n"
       "addvl x10, x10, #4\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[2]\n"
+      "sdot z9.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z10.s, z17.b, z0.b[2]\n"
+      "sdot z11.s, z16.b, z0.b[2]\n"
       "addvl x10, x10, #4\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[3]\n"
+      "sdot z9.s, z16.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[3]\n"
+      "sdot z11.s, z16.b, z0.b[3]\n"
       "addvl x10, x10, #4\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -244,15 +244,15 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 14f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 15f\n"
       "14:"  // Height 2: no accumulate
       "mov z8.s, #0x0\n"
@@ -268,12 +268,12 @@
       "16:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -281,146 +281,146 @@
       "b 18f\n"
       "17:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "18:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "ble 20f\n"
       "19:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z1.b }, p0/Z, [x26]\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[0]\n"
+      "sdot z12.s, z17.b, z0.b[0]\n"
+      "sdot z9.s, z16.b, z1.b[0]\n"
+      "sdot z13.s, z16.b, z0.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z1.b[0]\n"
+      "sdot z14.s, z17.b, z0.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
       "cmp x27, #0x10\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "sdot z11.s, z16.b, z1.b[0]\n"
+      "sdot z15.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[1]\n"
+      "sdot z12.s, z17.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "sdot z9.s, z16.b, z1.b[1]\n"
+      "sdot z13.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z10.s, z17.b, z1.b[1]\n"
+      "sdot z14.s, z17.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "sdot z11.s, z16.b, z1.b[1]\n"
+      "sdot z15.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[2]\n"
+      "sdot z12.s, z17.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "sdot z9.s, z16.b, z1.b[2]\n"
+      "sdot z13.s, z16.b, z0.b[2]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "sdot z10.s, z17.b, z1.b[2]\n"
+      "sdot z14.s, z17.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "sdot z11.s, z16.b, z1.b[2]\n"
+      "sdot z15.s, z16.b, z0.b[2]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "sdot z8.s, z17.b, z1.b[3]\n"
+      "sdot z12.s, z17.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "sdot z9.s, z16.b, z1.b[3]\n"
+      "sdot z13.s, z16.b, z0.b[3]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "sdot z10.s, z17.b, z1.b[3]\n"
+      "sdot z14.s, z17.b, z0.b[3]\n"
+      "sdot z11.s, z16.b, z1.b[3]\n"
+      "sdot z15.s, z16.b, z0.b[3]\n"
       "bgt 19b\n"
       "20:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "subs x27, x27, #0x4\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[0]\n"
+      "sdot z12.s, z17.b, z1.b[0]\n"
+      "sdot z9.s, z16.b, z0.b[0]\n"
+      "sdot z13.s, z16.b, z1.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[0]\n"
+      "sdot z14.s, z17.b, z1.b[0]\n"
       "addvl x10, x10, #4\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z11.s, z16.b, z0.b[0]\n"
+      "sdot z15.s, z16.b, z1.b[0]\n"
       "ble 21f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[1]\n"
+      "sdot z12.s, z17.b, z1.b[1]\n"
+      "sdot z9.s, z16.b, z0.b[1]\n"
+      "sdot z13.s, z16.b, z1.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z10.s, z17.b, z0.b[1]\n"
+      "sdot z14.s, z17.b, z1.b[1]\n"
       "addvl x10, x10, #4\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z11.s, z16.b, z0.b[1]\n"
+      "sdot z15.s, z16.b, z1.b[1]\n"
       "ble 21f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[2]\n"
+      "sdot z12.s, z17.b, z1.b[2]\n"
+      "sdot z9.s, z16.b, z0.b[2]\n"
+      "sdot z13.s, z16.b, z1.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z10.s, z17.b, z0.b[2]\n"
+      "sdot z14.s, z17.b, z1.b[2]\n"
       "addvl x10, x10, #4\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z11.s, z16.b, z0.b[2]\n"
+      "sdot z15.s, z16.b, z1.b[2]\n"
       "ble 21f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z17.b, z0.b[3]\n"
+      "sdot z12.s, z17.b, z1.b[3]\n"
+      "sdot z9.s, z16.b, z0.b[3]\n"
+      "sdot z13.s, z16.b, z1.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z17.b, z0.b[3]\n"
+      "sdot z14.s, z17.b, z1.b[3]\n"
       "addvl x10, x10, #4\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z11.s, z16.b, z0.b[3]\n"
+      "sdot z15.s, z16.b, z1.b[3]\n"
       "21:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 16b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x20]\n"
+      "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
       "22:"  // Height 2: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -441,20 +441,20 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 25f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23]\n"
-      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20]\n"
+      "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 26f\n"
       "25:"  // Height 3: no accumulate
       "mov z8.s, #0x0\n"
@@ -474,13 +474,13 @@
       "27:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 28f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 29f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -489,86 +489,86 @@
       "b 29f\n"
       "28:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "29:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "ble 31f\n"
       "30:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x24]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "sdot z8.s, z21.b, z2.b[0]\n"
+      "sdot z12.s, z21.b, z1.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z16.s, z21.b, z0.b[0]\n"
+      "sdot z9.s, z20.b, z2.b[0]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[0]\n"
+      "sdot z17.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "cmp x27, #0x10\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z10.s, z21.b, z2.b[0]\n"
+      "sdot z14.s, z21.b, z1.b[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "sdot z18.s, z21.b, z0.b[0]\n"
+      "sdot z11.s, z20.b, z2.b[0]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "sdot z15.s, z20.b, z1.b[0]\n"
+      "sdot z19.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "sdot z8.s, z21.b, z2.b[1]\n"
+      "sdot z12.s, z21.b, z1.b[1]\n"
+      "sdot z16.s, z21.b, z0.b[1]\n"
+      "sdot z9.s, z20.b, z2.b[1]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[1]\n"
+      "sdot z17.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z10.s, z21.b, z2.b[1]\n"
+      "sdot z14.s, z21.b, z1.b[1]\n"
+      "sdot z18.s, z21.b, z0.b[1]\n"
+      "sdot z11.s, z20.b, z2.b[1]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "sdot z15.s, z20.b, z1.b[1]\n"
+      "sdot z19.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "sdot z8.s, z21.b, z2.b[2]\n"
+      "sdot z12.s, z21.b, z1.b[2]\n"
+      "sdot z16.s, z21.b, z0.b[2]\n"
+      "sdot z9.s, z20.b, z2.b[2]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[2]\n"
+      "sdot z17.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "sdot z10.s, z21.b, z2.b[2]\n"
+      "sdot z14.s, z21.b, z1.b[2]\n"
+      "sdot z18.s, z21.b, z0.b[2]\n"
+      "sdot z11.s, z20.b, z2.b[2]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "sdot z15.s, z20.b, z1.b[2]\n"
+      "sdot z19.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "sdot z8.s, z21.b, z2.b[3]\n"
+      "sdot z12.s, z21.b, z1.b[3]\n"
+      "sdot z16.s, z21.b, z0.b[3]\n"
+      "sdot z9.s, z20.b, z2.b[3]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[3]\n"
+      "sdot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "sdot z10.s, z21.b, z2.b[3]\n"
+      "sdot z14.s, z21.b, z1.b[3]\n"
+      "sdot z18.s, z21.b, z0.b[3]\n"
+      "sdot z11.s, z20.b, z2.b[3]\n"
+      "sdot z15.s, z20.b, z1.b[3]\n"
+      "sdot z19.s, z20.b, z0.b[3]\n"
       "bgt 30b\n"
       "31:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -576,100 +576,100 @@
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "subs x27, x27, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "sdot z8.s, z21.b, z0.b[0]\n"
+      "sdot z12.s, z21.b, z1.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z16.s, z21.b, z2.b[0]\n"
+      "sdot z9.s, z20.b, z0.b[0]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[0]\n"
+      "sdot z17.s, z20.b, z2.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z10.s, z21.b, z0.b[0]\n"
+      "sdot z14.s, z21.b, z1.b[0]\n"
+      "sdot z18.s, z21.b, z2.b[0]\n"
+      "sdot z11.s, z20.b, z0.b[0]\n"
+      "sdot z15.s, z20.b, z1.b[0]\n"
+      "sdot z19.s, z20.b, z2.b[0]\n"
       "ble 32f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z21.b, z0.b[1]\n"
+      "sdot z12.s, z21.b, z1.b[1]\n"
+      "sdot z16.s, z21.b, z2.b[1]\n"
+      "sdot z9.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[1]\n"
+      "sdot z17.s, z20.b, z2.b[1]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z10.s, z21.b, z0.b[1]\n"
+      "sdot z14.s, z21.b, z1.b[1]\n"
+      "sdot z18.s, z21.b, z2.b[1]\n"
+      "sdot z11.s, z20.b, z0.b[1]\n"
+      "sdot z15.s, z20.b, z1.b[1]\n"
+      "sdot z19.s, z20.b, z2.b[1]\n"
       "ble 32f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z21.b, z0.b[2]\n"
+      "sdot z12.s, z21.b, z1.b[2]\n"
+      "sdot z16.s, z21.b, z2.b[2]\n"
+      "sdot z9.s, z20.b, z0.b[2]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[2]\n"
+      "sdot z17.s, z20.b, z2.b[2]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z10.s, z21.b, z0.b[2]\n"
+      "sdot z14.s, z21.b, z1.b[2]\n"
+      "sdot z18.s, z21.b, z2.b[2]\n"
+      "sdot z11.s, z20.b, z0.b[2]\n"
+      "sdot z15.s, z20.b, z1.b[2]\n"
+      "sdot z19.s, z20.b, z2.b[2]\n"
       "ble 32f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z21.b, z0.b[3]\n"
+      "sdot z12.s, z21.b, z1.b[3]\n"
+      "sdot z16.s, z21.b, z2.b[3]\n"
+      "sdot z9.s, z20.b, z0.b[3]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z20.b, z1.b[3]\n"
+      "sdot z17.s, z20.b, z2.b[3]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z10.s, z21.b, z0.b[3]\n"
+      "sdot z14.s, z21.b, z1.b[3]\n"
+      "sdot z18.s, z21.b, z2.b[3]\n"
+      "sdot z11.s, z20.b, z0.b[3]\n"
+      "sdot z15.s, z20.b, z1.b[3]\n"
+      "sdot z19.s, z20.b, z2.b[3]\n"
       "32:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 27b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x21]\n"
+      "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x20]\n"
+      "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
       "33:"  // Height 3: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -690,25 +690,25 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23]\n"
-      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21]\n"
+      "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 37f\n"
       "36:"  // Height 4: no accumulate
       "mov z8.s, #0x0\n"
@@ -732,14 +732,14 @@
       "38:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 39f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -749,105 +749,105 @@
       "b 40f\n"
       "39:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "40:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "ble 42f\n"
       "41:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z3.b }, p0/Z, [x26]\n"
+      "ld1rqb { z2.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "ld1rqb { z0.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[0]\n"
+      "sdot z12.s, z25.b, z2.b[0]\n"
+      "sdot z16.s, z25.b, z1.b[0]\n"
+      "sdot z20.s, z25.b, z0.b[0]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
+      "sdot z9.s, z24.b, z3.b[0]\n"
+      "sdot z13.s, z24.b, z2.b[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "sdot z17.s, z24.b, z1.b[0]\n"
+      "sdot z21.s, z24.b, z0.b[0]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z25.b, z3.b[0]\n"
+      "sdot z14.s, z25.b, z2.b[0]\n"
+      "sdot z18.s, z25.b, z1.b[0]\n"
+      "sdot z22.s, z25.b, z0.b[0]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "sdot z11.s, z24.b, z3.b[0]\n"
+      "sdot z15.s, z24.b, z2.b[0]\n"
+      "sdot z19.s, z24.b, z1.b[0]\n"
+      "sdot z23.s, z24.b, z0.b[0]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[1]\n"
+      "sdot z12.s, z25.b, z2.b[1]\n"
+      "sdot z16.s, z25.b, z1.b[1]\n"
+      "sdot z20.s, z25.b, z0.b[1]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "sdot z9.s, z24.b, z3.b[1]\n"
+      "sdot z13.s, z24.b, z2.b[1]\n"
+      "sdot z17.s, z24.b, z1.b[1]\n"
+      "sdot z21.s, z24.b, z0.b[1]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z10.s, z25.b, z3.b[1]\n"
+      "sdot z14.s, z25.b, z2.b[1]\n"
+      "sdot z18.s, z25.b, z1.b[1]\n"
+      "sdot z22.s, z25.b, z0.b[1]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "sdot z11.s, z24.b, z3.b[1]\n"
+      "sdot z15.s, z24.b, z2.b[1]\n"
+      "sdot z19.s, z24.b, z1.b[1]\n"
+      "sdot z23.s, z24.b, z0.b[1]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[2]\n"
+      "sdot z12.s, z25.b, z2.b[2]\n"
+      "sdot z16.s, z25.b, z1.b[2]\n"
+      "sdot z20.s, z25.b, z0.b[2]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "sdot z9.s, z24.b, z3.b[2]\n"
+      "sdot z13.s, z24.b, z2.b[2]\n"
+      "sdot z17.s, z24.b, z1.b[2]\n"
+      "sdot z21.s, z24.b, z0.b[2]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "sdot z10.s, z25.b, z3.b[2]\n"
+      "sdot z14.s, z25.b, z2.b[2]\n"
+      "sdot z18.s, z25.b, z1.b[2]\n"
+      "sdot z22.s, z25.b, z0.b[2]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "sdot z11.s, z24.b, z3.b[2]\n"
+      "sdot z15.s, z24.b, z2.b[2]\n"
+      "sdot z19.s, z24.b, z1.b[2]\n"
+      "sdot z23.s, z24.b, z0.b[2]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "sdot z8.s, z25.b, z3.b[3]\n"
+      "sdot z12.s, z25.b, z2.b[3]\n"
+      "sdot z16.s, z25.b, z1.b[3]\n"
+      "sdot z20.s, z25.b, z0.b[3]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "sdot z9.s, z24.b, z3.b[3]\n"
+      "sdot z13.s, z24.b, z2.b[3]\n"
+      "sdot z17.s, z24.b, z1.b[3]\n"
+      "sdot z21.s, z24.b, z0.b[3]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "sdot z10.s, z25.b, z3.b[3]\n"
+      "sdot z14.s, z25.b, z2.b[3]\n"
+      "sdot z18.s, z25.b, z1.b[3]\n"
+      "sdot z22.s, z25.b, z0.b[3]\n"
+      "sdot z11.s, z24.b, z3.b[3]\n"
+      "sdot z15.s, z24.b, z2.b[3]\n"
+      "sdot z19.s, z24.b, z1.b[3]\n"
+      "sdot z23.s, z24.b, z0.b[3]\n"
       "bgt 41b\n"
       "42:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -856,121 +856,121 @@
       "subs x27, x27, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[0]\n"
+      "sdot z12.s, z25.b, z1.b[0]\n"
+      "sdot z16.s, z25.b, z2.b[0]\n"
+      "sdot z20.s, z25.b, z3.b[0]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[0]\n"
+      "sdot z13.s, z24.b, z1.b[0]\n"
+      "sdot z17.s, z24.b, z2.b[0]\n"
+      "sdot z21.s, z24.b, z3.b[0]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z10.s, z25.b, z0.b[0]\n"
+      "sdot z14.s, z25.b, z1.b[0]\n"
+      "sdot z18.s, z25.b, z2.b[0]\n"
+      "sdot z22.s, z25.b, z3.b[0]\n"
+      "sdot z11.s, z24.b, z0.b[0]\n"
+      "sdot z15.s, z24.b, z1.b[0]\n"
+      "sdot z19.s, z24.b, z2.b[0]\n"
+      "sdot z23.s, z24.b, z3.b[0]\n"
       "ble 43f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[1]\n"
+      "sdot z12.s, z25.b, z1.b[1]\n"
+      "sdot z16.s, z25.b, z2.b[1]\n"
+      "sdot z20.s, z25.b, z3.b[1]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[1]\n"
+      "sdot z13.s, z24.b, z1.b[1]\n"
+      "sdot z17.s, z24.b, z2.b[1]\n"
+      "sdot z21.s, z24.b, z3.b[1]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z10.s, z25.b, z0.b[1]\n"
+      "sdot z14.s, z25.b, z1.b[1]\n"
+      "sdot z18.s, z25.b, z2.b[1]\n"
+      "sdot z22.s, z25.b, z3.b[1]\n"
+      "sdot z11.s, z24.b, z0.b[1]\n"
+      "sdot z15.s, z24.b, z1.b[1]\n"
+      "sdot z19.s, z24.b, z2.b[1]\n"
+      "sdot z23.s, z24.b, z3.b[1]\n"
       "ble 43f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[2]\n"
+      "sdot z12.s, z25.b, z1.b[2]\n"
+      "sdot z16.s, z25.b, z2.b[2]\n"
+      "sdot z20.s, z25.b, z3.b[2]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[2]\n"
+      "sdot z13.s, z24.b, z1.b[2]\n"
+      "sdot z17.s, z24.b, z2.b[2]\n"
+      "sdot z21.s, z24.b, z3.b[2]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z10.s, z25.b, z0.b[2]\n"
+      "sdot z14.s, z25.b, z1.b[2]\n"
+      "sdot z18.s, z25.b, z2.b[2]\n"
+      "sdot z22.s, z25.b, z3.b[2]\n"
+      "sdot z11.s, z24.b, z0.b[2]\n"
+      "sdot z15.s, z24.b, z1.b[2]\n"
+      "sdot z19.s, z24.b, z2.b[2]\n"
+      "sdot z23.s, z24.b, z3.b[2]\n"
       "ble 43f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z25.b, z0.b[3]\n"
+      "sdot z12.s, z25.b, z1.b[3]\n"
+      "sdot z16.s, z25.b, z2.b[3]\n"
+      "sdot z20.s, z25.b, z3.b[3]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z9.s, z24.b, z0.b[3]\n"
+      "sdot z13.s, z24.b, z1.b[3]\n"
+      "sdot z17.s, z24.b, z2.b[3]\n"
+      "sdot z21.s, z24.b, z3.b[3]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z10.s, z25.b, z0.b[3]\n"
+      "sdot z14.s, z25.b, z1.b[3]\n"
+      "sdot z18.s, z25.b, z2.b[3]\n"
+      "sdot z22.s, z25.b, z3.b[3]\n"
+      "sdot z11.s, z24.b, z0.b[3]\n"
+      "sdot z15.s, z24.b, z1.b[3]\n"
+      "sdot z19.s, z24.b, z2.b[3]\n"
+      "sdot z23.s, z24.b, z3.b[3]\n"
       "43:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 38b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "st1w { z8.s }, p4, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p4, [x22]\n"
-      "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x22]\n"
+      "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x21]\n"
+      "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z20.s }, p4, [x20]\n"
+      "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
       "44:"  // Height 4: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -991,30 +991,30 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23]\n"
-      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x21]\n"
-      "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22]\n"
+      "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x20]\n"
+      "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 48f\n"
       "47:"  // Height 5: no accumulate
       "mov z8.s, #0x0\n"
@@ -1042,15 +1042,15 @@
       "49:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1061,124 +1061,124 @@
       "b 51f\n"
       "50:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "51:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "ble 53f\n"
       "52:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z4.b }, p0/Z, [x26]\n"
+      "ld1rqb { z3.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1rqb { z0.b }, p0/Z, [x22]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "sdot z8.s, z29.b, z4.b[0]\n"
+      "sdot z12.s, z29.b, z3.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z16.s, z29.b, z2.b[0]\n"
+      "sdot z20.s, z29.b, z1.b[0]\n"
       "add x25, x25, #0x10\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z24.s, z29.b, z0.b[0]\n"
+      "sdot z9.s, z28.b, z4.b[0]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z13.s, z28.b, z3.b[0]\n"
+      "sdot z17.s, z28.b, z2.b[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "sdot z21.s, z28.b, z1.b[0]\n"
+      "sdot z25.s, z28.b, z0.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z29.b, z4.b[0]\n"
+      "sdot z14.s, z29.b, z3.b[0]\n"
+      "sdot z18.s, z29.b, z2.b[0]\n"
+      "sdot z22.s, z29.b, z1.b[0]\n"
+      "sdot z26.s, z29.b, z0.b[0]\n"
+      "sdot z11.s, z28.b, z4.b[0]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "sdot z15.s, z28.b, z3.b[0]\n"
+      "sdot z19.s, z28.b, z2.b[0]\n"
+      "sdot z23.s, z28.b, z1.b[0]\n"
+      "sdot z27.s, z28.b, z0.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "sdot z8.s, z29.b, z4.b[1]\n"
+      "sdot z12.s, z29.b, z3.b[1]\n"
+      "sdot z16.s, z29.b, z2.b[1]\n"
+      "sdot z20.s, z29.b, z1.b[1]\n"
+      "sdot z24.s, z29.b, z0.b[1]\n"
+      "sdot z9.s, z28.b, z4.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "sdot z13.s, z28.b, z3.b[1]\n"
+      "sdot z17.s, z28.b, z2.b[1]\n"
+      "sdot z21.s, z28.b, z1.b[1]\n"
+      "sdot z25.s, z28.b, z0.b[1]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z10.s, z29.b, z4.b[1]\n"
+      "sdot z14.s, z29.b, z3.b[1]\n"
+      "sdot z18.s, z29.b, z2.b[1]\n"
+      "sdot z22.s, z29.b, z1.b[1]\n"
+      "sdot z26.s, z29.b, z0.b[1]\n"
+      "sdot z11.s, z28.b, z4.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "sdot z15.s, z28.b, z3.b[1]\n"
+      "sdot z19.s, z28.b, z2.b[1]\n"
+      "sdot z23.s, z28.b, z1.b[1]\n"
+      "sdot z27.s, z28.b, z0.b[1]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "sdot z8.s, z29.b, z4.b[2]\n"
+      "sdot z12.s, z29.b, z3.b[2]\n"
+      "sdot z16.s, z29.b, z2.b[2]\n"
+      "sdot z20.s, z29.b, z1.b[2]\n"
+      "sdot z24.s, z29.b, z0.b[2]\n"
+      "sdot z9.s, z28.b, z4.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "sdot z13.s, z28.b, z3.b[2]\n"
+      "sdot z17.s, z28.b, z2.b[2]\n"
+      "sdot z21.s, z28.b, z1.b[2]\n"
+      "sdot z25.s, z28.b, z0.b[2]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "sdot z10.s, z29.b, z4.b[2]\n"
+      "sdot z14.s, z29.b, z3.b[2]\n"
+      "sdot z18.s, z29.b, z2.b[2]\n"
+      "sdot z22.s, z29.b, z1.b[2]\n"
+      "sdot z26.s, z29.b, z0.b[2]\n"
+      "sdot z11.s, z28.b, z4.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "sdot z15.s, z28.b, z3.b[2]\n"
+      "sdot z19.s, z28.b, z2.b[2]\n"
+      "sdot z23.s, z28.b, z1.b[2]\n"
+      "sdot z27.s, z28.b, z0.b[2]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "sdot z8.s, z29.b, z4.b[3]\n"
+      "sdot z12.s, z29.b, z3.b[3]\n"
+      "sdot z16.s, z29.b, z2.b[3]\n"
+      "sdot z20.s, z29.b, z1.b[3]\n"
+      "sdot z24.s, z29.b, z0.b[3]\n"
+      "sdot z9.s, z28.b, z4.b[3]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "sdot z13.s, z28.b, z3.b[3]\n"
+      "sdot z17.s, z28.b, z2.b[3]\n"
+      "sdot z21.s, z28.b, z1.b[3]\n"
+      "sdot z25.s, z28.b, z0.b[3]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "sdot z10.s, z29.b, z4.b[3]\n"
+      "sdot z14.s, z29.b, z3.b[3]\n"
+      "sdot z18.s, z29.b, z2.b[3]\n"
+      "sdot z22.s, z29.b, z1.b[3]\n"
+      "sdot z26.s, z29.b, z0.b[3]\n"
+      "sdot z11.s, z28.b, z4.b[3]\n"
+      "sdot z15.s, z28.b, z3.b[3]\n"
+      "sdot z19.s, z28.b, z2.b[3]\n"
+      "sdot z23.s, z28.b, z1.b[3]\n"
+      "sdot z27.s, z28.b, z0.b[3]\n"
       "bgt 52b\n"
       "53:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -1188,142 +1188,142 @@
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
       "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "sdot z8.s, z29.b, z0.b[0]\n"
+      "sdot z12.s, z29.b, z1.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z16.s, z29.b, z2.b[0]\n"
+      "sdot z20.s, z29.b, z3.b[0]\n"
+      "sdot z24.s, z29.b, z4.b[0]\n"
+      "sdot z9.s, z28.b, z0.b[0]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[0]\n"
+      "sdot z17.s, z28.b, z2.b[0]\n"
+      "sdot z21.s, z28.b, z3.b[0]\n"
+      "sdot z25.s, z28.b, z4.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
+      "sdot z10.s, z29.b, z0.b[0]\n"
+      "sdot z14.s, z29.b, z1.b[0]\n"
+      "sdot z18.s, z29.b, z2.b[0]\n"
+      "sdot z22.s, z29.b, z3.b[0]\n"
+      "sdot z26.s, z29.b, z4.b[0]\n"
+      "sdot z11.s, z28.b, z0.b[0]\n"
+      "sdot z15.s, z28.b, z1.b[0]\n"
+      "sdot z19.s, z28.b, z2.b[0]\n"
+      "sdot z23.s, z28.b, z3.b[0]\n"
+      "sdot z27.s, z28.b, z4.b[0]\n"
       "ble 54f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z29.b, z0.b[1]\n"
+      "sdot z12.s, z29.b, z1.b[1]\n"
+      "sdot z16.s, z29.b, z2.b[1]\n"
+      "sdot z20.s, z29.b, z3.b[1]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z24.s, z29.b, z4.b[1]\n"
+      "sdot z9.s, z28.b, z0.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[1]\n"
+      "sdot z17.s, z28.b, z2.b[1]\n"
+      "sdot z21.s, z28.b, z3.b[1]\n"
+      "sdot z25.s, z28.b, z4.b[1]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
+      "sdot z10.s, z29.b, z0.b[1]\n"
+      "sdot z14.s, z29.b, z1.b[1]\n"
+      "sdot z18.s, z29.b, z2.b[1]\n"
+      "sdot z22.s, z29.b, z3.b[1]\n"
+      "sdot z26.s, z29.b, z4.b[1]\n"
+      "sdot z11.s, z28.b, z0.b[1]\n"
+      "sdot z15.s, z28.b, z1.b[1]\n"
+      "sdot z19.s, z28.b, z2.b[1]\n"
+      "sdot z23.s, z28.b, z3.b[1]\n"
+      "sdot z27.s, z28.b, z4.b[1]\n"
       "ble 54f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z29.b, z0.b[2]\n"
+      "sdot z12.s, z29.b, z1.b[2]\n"
+      "sdot z16.s, z29.b, z2.b[2]\n"
+      "sdot z20.s, z29.b, z3.b[2]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z24.s, z29.b, z4.b[2]\n"
+      "sdot z9.s, z28.b, z0.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[2]\n"
+      "sdot z17.s, z28.b, z2.b[2]\n"
+      "sdot z21.s, z28.b, z3.b[2]\n"
+      "sdot z25.s, z28.b, z4.b[2]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
+      "sdot z10.s, z29.b, z0.b[2]\n"
+      "sdot z14.s, z29.b, z1.b[2]\n"
+      "sdot z18.s, z29.b, z2.b[2]\n"
+      "sdot z22.s, z29.b, z3.b[2]\n"
+      "sdot z26.s, z29.b, z4.b[2]\n"
+      "sdot z11.s, z28.b, z0.b[2]\n"
+      "sdot z15.s, z28.b, z1.b[2]\n"
+      "sdot z19.s, z28.b, z2.b[2]\n"
+      "sdot z23.s, z28.b, z3.b[2]\n"
+      "sdot z27.s, z28.b, z4.b[2]\n"
       "ble 54f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z29.b, z0.b[3]\n"
+      "sdot z12.s, z29.b, z1.b[3]\n"
+      "sdot z16.s, z29.b, z2.b[3]\n"
+      "sdot z20.s, z29.b, z3.b[3]\n"
+      "sdot z24.s, z29.b, z4.b[3]\n"
+      "sdot z9.s, z28.b, z0.b[3]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z13.s, z28.b, z1.b[3]\n"
+      "sdot z17.s, z28.b, z2.b[3]\n"
+      "sdot z21.s, z28.b, z3.b[3]\n"
+      "sdot z25.s, z28.b, z4.b[3]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z10.s, z29.b, z0.b[3]\n"
+      "sdot z14.s, z29.b, z1.b[3]\n"
+      "sdot z18.s, z29.b, z2.b[3]\n"
+      "sdot z22.s, z29.b, z3.b[3]\n"
+      "sdot z26.s, z29.b, z4.b[3]\n"
+      "sdot z11.s, z28.b, z0.b[3]\n"
+      "sdot z15.s, z28.b, z1.b[3]\n"
+      "sdot z19.s, z28.b, z2.b[3]\n"
+      "sdot z23.s, z28.b, z3.b[3]\n"
+      "sdot z27.s, z28.b, z4.b[3]\n"
       "54:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 49b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "st1w { z8.s }, p4, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "st1w { z8.s }, p4, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p4, [x22]\n"
-      "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
-      "st1w { z24.s }, p4, [x21]\n"
-      "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
-      "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
-      "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x23]\n"
+      "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x22]\n"
+      "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z20.s }, p4, [x21]\n"
+      "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z24.s }, p4, [x20]\n"
+      "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
       "55:"  // Height 5: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -1407,16 +1407,16 @@
       "60:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 61f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 62f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1428,143 +1428,143 @@
       "b 62f\n"
       "61:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "62:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "ble 64f\n"
       "63:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z7.b }, p0/Z, [x26]\n"
+      "ld1rqb { z6.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z5.b }, p0/Z, [x24]\n"
+      "ld1rqb { z4.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1rqb { z5.b }, p0/Z, [x21]\n"
+      "ld1rqb { z3.b }, p0/Z, [x22]\n"
+      "ld1rqb { z2.b }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z1.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[0]\n"
+      "sdot z12.s, z1.b, z6.b[0]\n"
+      "sdot z16.s, z1.b, z5.b[0]\n"
+      "sdot z20.s, z1.b, z4.b[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z28.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z24.s, z1.b, z3.b[0]\n"
+      "sdot z28.s, z1.b, z2.b[0]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "sdot z29.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z30.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
-      "sdot z31.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z28.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "sdot z29.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[0]\n"
+      "sdot z13.s, z0.b, z6.b[0]\n"
+      "sdot z17.s, z0.b, z5.b[0]\n"
+      "sdot z21.s, z0.b, z4.b[0]\n"
+      "sdot z25.s, z0.b, z3.b[0]\n"
+      "sdot z29.s, z0.b, z2.b[0]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z10.s, z1.b, z7.b[0]\n"
+      "sdot z14.s, z1.b, z6.b[0]\n"
+      "sdot z18.s, z1.b, z5.b[0]\n"
+      "sdot z22.s, z1.b, z4.b[0]\n"
+      "sdot z26.s, z1.b, z3.b[0]\n"
+      "sdot z30.s, z1.b, z2.b[0]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "sdot z11.s, z0.b, z7.b[0]\n"
+      "sdot z15.s, z0.b, z6.b[0]\n"
+      "sdot z19.s, z0.b, z5.b[0]\n"
+      "sdot z23.s, z0.b, z4.b[0]\n"
+      "sdot z27.s, z0.b, z3.b[0]\n"
+      "sdot z31.s, z0.b, z2.b[0]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[1]\n"
+      "sdot z12.s, z1.b, z6.b[1]\n"
+      "sdot z16.s, z1.b, z5.b[1]\n"
+      "sdot z20.s, z1.b, z4.b[1]\n"
+      "sdot z24.s, z1.b, z3.b[1]\n"
+      "sdot z28.s, z1.b, z2.b[1]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[1]\n"
+      "sdot z13.s, z0.b, z6.b[1]\n"
+      "sdot z17.s, z0.b, z5.b[1]\n"
+      "sdot z21.s, z0.b, z4.b[1]\n"
+      "sdot z25.s, z0.b, z3.b[1]\n"
+      "sdot z29.s, z0.b, z2.b[1]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z30.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
-      "sdot z31.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z28.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "sdot z29.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z30.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
-      "sdot z31.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z28.s, z6.b, z5.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "sdot z29.s, z7.b, z5.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z30.s, z6.b, z5.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
-      "sdot z31.s, z7.b, z5.b[3]\n"
+      "sdot z10.s, z1.b, z7.b[1]\n"
+      "sdot z14.s, z1.b, z6.b[1]\n"
+      "sdot z18.s, z1.b, z5.b[1]\n"
+      "sdot z22.s, z1.b, z4.b[1]\n"
+      "sdot z26.s, z1.b, z3.b[1]\n"
+      "sdot z30.s, z1.b, z2.b[1]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "sdot z11.s, z0.b, z7.b[1]\n"
+      "sdot z15.s, z0.b, z6.b[1]\n"
+      "sdot z19.s, z0.b, z5.b[1]\n"
+      "sdot z23.s, z0.b, z4.b[1]\n"
+      "sdot z27.s, z0.b, z3.b[1]\n"
+      "sdot z31.s, z0.b, z2.b[1]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[2]\n"
+      "sdot z12.s, z1.b, z6.b[2]\n"
+      "sdot z16.s, z1.b, z5.b[2]\n"
+      "sdot z20.s, z1.b, z4.b[2]\n"
+      "sdot z24.s, z1.b, z3.b[2]\n"
+      "sdot z28.s, z1.b, z2.b[2]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[2]\n"
+      "sdot z13.s, z0.b, z6.b[2]\n"
+      "sdot z17.s, z0.b, z5.b[2]\n"
+      "sdot z21.s, z0.b, z4.b[2]\n"
+      "sdot z25.s, z0.b, z3.b[2]\n"
+      "sdot z29.s, z0.b, z2.b[2]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "sdot z10.s, z1.b, z7.b[2]\n"
+      "sdot z14.s, z1.b, z6.b[2]\n"
+      "sdot z18.s, z1.b, z5.b[2]\n"
+      "sdot z22.s, z1.b, z4.b[2]\n"
+      "sdot z26.s, z1.b, z3.b[2]\n"
+      "sdot z30.s, z1.b, z2.b[2]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "sdot z11.s, z0.b, z7.b[2]\n"
+      "sdot z15.s, z0.b, z6.b[2]\n"
+      "sdot z19.s, z0.b, z5.b[2]\n"
+      "sdot z23.s, z0.b, z4.b[2]\n"
+      "sdot z27.s, z0.b, z3.b[2]\n"
+      "sdot z31.s, z0.b, z2.b[2]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "sdot z8.s, z1.b, z7.b[3]\n"
+      "sdot z12.s, z1.b, z6.b[3]\n"
+      "sdot z16.s, z1.b, z5.b[3]\n"
+      "sdot z20.s, z1.b, z4.b[3]\n"
+      "sdot z24.s, z1.b, z3.b[3]\n"
+      "sdot z28.s, z1.b, z2.b[3]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "sdot z9.s, z0.b, z7.b[3]\n"
+      "sdot z13.s, z0.b, z6.b[3]\n"
+      "sdot z17.s, z0.b, z5.b[3]\n"
+      "sdot z21.s, z0.b, z4.b[3]\n"
+      "sdot z25.s, z0.b, z3.b[3]\n"
+      "sdot z29.s, z0.b, z2.b[3]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "sdot z10.s, z1.b, z7.b[3]\n"
+      "sdot z14.s, z1.b, z6.b[3]\n"
+      "sdot z18.s, z1.b, z5.b[3]\n"
+      "sdot z22.s, z1.b, z4.b[3]\n"
+      "sdot z26.s, z1.b, z3.b[3]\n"
+      "sdot z30.s, z1.b, z2.b[3]\n"
+      "sdot z11.s, z0.b, z7.b[3]\n"
+      "sdot z15.s, z0.b, z6.b[3]\n"
+      "sdot z19.s, z0.b, z5.b[3]\n"
+      "sdot z23.s, z0.b, z4.b[3]\n"
+      "sdot z27.s, z0.b, z3.b[3]\n"
+      "sdot z31.s, z0.b, z2.b[3]\n"
       "bgt 63b\n"
       "64:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -1575,127 +1575,127 @@
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
       "ld1rqb { z4.b }, p0/Z, [x22]\n"
       "ld1rqb { z5.b }, p0/Z, [x21]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[0]\n"
-      "sdot z12.s, z6.b, z1.b[0]\n"
-      "sdot z16.s, z6.b, z2.b[0]\n"
-      "sdot z20.s, z6.b, z3.b[0]\n"
-      "sdot z24.s, z6.b, z4.b[0]\n"
-      "sdot z28.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[0]\n"
-      "sdot z13.s, z7.b, z1.b[0]\n"
-      "sdot z17.s, z7.b, z2.b[0]\n"
-      "sdot z21.s, z7.b, z3.b[0]\n"
-      "sdot z25.s, z7.b, z4.b[0]\n"
-      "sdot z29.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[0]\n"
+      "sdot z12.s, z7.b, z1.b[0]\n"
+      "sdot z16.s, z7.b, z2.b[0]\n"
+      "sdot z20.s, z7.b, z3.b[0]\n"
+      "sdot z24.s, z7.b, z4.b[0]\n"
+      "sdot z28.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[0]\n"
+      "sdot z13.s, z6.b, z1.b[0]\n"
+      "sdot z17.s, z6.b, z2.b[0]\n"
+      "sdot z21.s, z6.b, z3.b[0]\n"
+      "sdot z25.s, z6.b, z4.b[0]\n"
+      "sdot z29.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[0]\n"
-      "sdot z14.s, z6.b, z1.b[0]\n"
-      "sdot z18.s, z6.b, z2.b[0]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z26.s, z6.b, z4.b[0]\n"
-      "sdot z30.s, z6.b, z5.b[0]\n"
-      "sdot z11.s, z7.b, z0.b[0]\n"
-      "sdot z15.s, z7.b, z1.b[0]\n"
-      "sdot z19.s, z7.b, z2.b[0]\n"
-      "sdot z23.s, z7.b, z3.b[0]\n"
-      "sdot z27.s, z7.b, z4.b[0]\n"
-      "sdot z31.s, z7.b, z5.b[0]\n"
+      "sdot z10.s, z7.b, z0.b[0]\n"
+      "sdot z14.s, z7.b, z1.b[0]\n"
+      "sdot z18.s, z7.b, z2.b[0]\n"
+      "sdot z22.s, z7.b, z3.b[0]\n"
+      "sdot z26.s, z7.b, z4.b[0]\n"
+      "sdot z30.s, z7.b, z5.b[0]\n"
+      "sdot z11.s, z6.b, z0.b[0]\n"
+      "sdot z15.s, z6.b, z1.b[0]\n"
+      "sdot z19.s, z6.b, z2.b[0]\n"
+      "sdot z23.s, z6.b, z3.b[0]\n"
+      "sdot z27.s, z6.b, z4.b[0]\n"
+      "sdot z31.s, z6.b, z5.b[0]\n"
       "ble 65f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[1]\n"
-      "sdot z12.s, z6.b, z1.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[1]\n"
-      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[1]\n"
+      "sdot z12.s, z7.b, z1.b[1]\n"
+      "sdot z16.s, z7.b, z2.b[1]\n"
+      "sdot z20.s, z7.b, z3.b[1]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[1]\n"
-      "sdot z28.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[1]\n"
-      "sdot z13.s, z7.b, z1.b[1]\n"
-      "sdot z17.s, z7.b, z2.b[1]\n"
-      "sdot z21.s, z7.b, z3.b[1]\n"
-      "sdot z25.s, z7.b, z4.b[1]\n"
-      "sdot z29.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z24.s, z7.b, z4.b[1]\n"
+      "sdot z28.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[1]\n"
+      "sdot z13.s, z6.b, z1.b[1]\n"
+      "sdot z17.s, z6.b, z2.b[1]\n"
+      "sdot z21.s, z6.b, z3.b[1]\n"
+      "sdot z25.s, z6.b, z4.b[1]\n"
+      "sdot z29.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[1]\n"
-      "sdot z14.s, z6.b, z1.b[1]\n"
-      "sdot z18.s, z6.b, z2.b[1]\n"
-      "sdot z22.s, z6.b, z3.b[1]\n"
-      "sdot z26.s, z6.b, z4.b[1]\n"
-      "sdot z30.s, z6.b, z5.b[1]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z15.s, z7.b, z1.b[1]\n"
-      "sdot z19.s, z7.b, z2.b[1]\n"
-      "sdot z23.s, z7.b, z3.b[1]\n"
-      "sdot z27.s, z7.b, z4.b[1]\n"
-      "sdot z31.s, z7.b, z5.b[1]\n"
+      "sdot z10.s, z7.b, z0.b[1]\n"
+      "sdot z14.s, z7.b, z1.b[1]\n"
+      "sdot z18.s, z7.b, z2.b[1]\n"
+      "sdot z22.s, z7.b, z3.b[1]\n"
+      "sdot z26.s, z7.b, z4.b[1]\n"
+      "sdot z30.s, z7.b, z5.b[1]\n"
+      "sdot z11.s, z6.b, z0.b[1]\n"
+      "sdot z15.s, z6.b, z1.b[1]\n"
+      "sdot z19.s, z6.b, z2.b[1]\n"
+      "sdot z23.s, z6.b, z3.b[1]\n"
+      "sdot z27.s, z6.b, z4.b[1]\n"
+      "sdot z31.s, z6.b, z5.b[1]\n"
       "ble 65f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[2]\n"
-      "sdot z12.s, z6.b, z1.b[2]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[2]\n"
+      "sdot z12.s, z7.b, z1.b[2]\n"
+      "sdot z16.s, z7.b, z2.b[2]\n"
+      "sdot z20.s, z7.b, z3.b[2]\n"
       "subs x27, x27, #0x4\n"
-      "sdot z24.s, z6.b, z4.b[2]\n"
-      "sdot z28.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[2]\n"
-      "sdot z13.s, z7.b, z1.b[2]\n"
-      "sdot z17.s, z7.b, z2.b[2]\n"
-      "sdot z21.s, z7.b, z3.b[2]\n"
-      "sdot z25.s, z7.b, z4.b[2]\n"
-      "sdot z29.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "sdot z24.s, z7.b, z4.b[2]\n"
+      "sdot z28.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[2]\n"
+      "sdot z13.s, z6.b, z1.b[2]\n"
+      "sdot z17.s, z6.b, z2.b[2]\n"
+      "sdot z21.s, z6.b, z3.b[2]\n"
+      "sdot z25.s, z6.b, z4.b[2]\n"
+      "sdot z29.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[2]\n"
-      "sdot z14.s, z6.b, z1.b[2]\n"
-      "sdot z18.s, z6.b, z2.b[2]\n"
-      "sdot z22.s, z6.b, z3.b[2]\n"
-      "sdot z26.s, z6.b, z4.b[2]\n"
-      "sdot z30.s, z6.b, z5.b[2]\n"
-      "sdot z11.s, z7.b, z0.b[2]\n"
-      "sdot z15.s, z7.b, z1.b[2]\n"
-      "sdot z19.s, z7.b, z2.b[2]\n"
-      "sdot z23.s, z7.b, z3.b[2]\n"
-      "sdot z27.s, z7.b, z4.b[2]\n"
-      "sdot z31.s, z7.b, z5.b[2]\n"
+      "sdot z10.s, z7.b, z0.b[2]\n"
+      "sdot z14.s, z7.b, z1.b[2]\n"
+      "sdot z18.s, z7.b, z2.b[2]\n"
+      "sdot z22.s, z7.b, z3.b[2]\n"
+      "sdot z26.s, z7.b, z4.b[2]\n"
+      "sdot z30.s, z7.b, z5.b[2]\n"
+      "sdot z11.s, z6.b, z0.b[2]\n"
+      "sdot z15.s, z6.b, z1.b[2]\n"
+      "sdot z19.s, z6.b, z2.b[2]\n"
+      "sdot z23.s, z6.b, z3.b[2]\n"
+      "sdot z27.s, z6.b, z4.b[2]\n"
+      "sdot z31.s, z6.b, z5.b[2]\n"
       "ble 65f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "sdot z8.s, z6.b, z0.b[3]\n"
-      "sdot z12.s, z6.b, z1.b[3]\n"
-      "sdot z16.s, z6.b, z2.b[3]\n"
-      "sdot z20.s, z6.b, z3.b[3]\n"
-      "sdot z24.s, z6.b, z4.b[3]\n"
-      "sdot z28.s, z6.b, z5.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "sdot z9.s, z7.b, z0.b[3]\n"
-      "sdot z13.s, z7.b, z1.b[3]\n"
-      "sdot z17.s, z7.b, z2.b[3]\n"
-      "sdot z21.s, z7.b, z3.b[3]\n"
-      "sdot z25.s, z7.b, z4.b[3]\n"
-      "sdot z29.s, z7.b, z5.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "sdot z8.s, z7.b, z0.b[3]\n"
+      "sdot z12.s, z7.b, z1.b[3]\n"
+      "sdot z16.s, z7.b, z2.b[3]\n"
+      "sdot z20.s, z7.b, z3.b[3]\n"
+      "sdot z24.s, z7.b, z4.b[3]\n"
+      "sdot z28.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "sdot z9.s, z6.b, z0.b[3]\n"
+      "sdot z13.s, z6.b, z1.b[3]\n"
+      "sdot z17.s, z6.b, z2.b[3]\n"
+      "sdot z21.s, z6.b, z3.b[3]\n"
+      "sdot z25.s, z6.b, z4.b[3]\n"
+      "sdot z29.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "sdot z10.s, z6.b, z0.b[3]\n"
-      "sdot z14.s, z6.b, z1.b[3]\n"
-      "sdot z18.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[3]\n"
-      "sdot z26.s, z6.b, z4.b[3]\n"
-      "sdot z30.s, z6.b, z5.b[3]\n"
-      "sdot z11.s, z7.b, z0.b[3]\n"
-      "sdot z15.s, z7.b, z1.b[3]\n"
-      "sdot z19.s, z7.b, z2.b[3]\n"
-      "sdot z23.s, z7.b, z3.b[3]\n"
-      "sdot z27.s, z7.b, z4.b[3]\n"
-      "sdot z31.s, z7.b, z5.b[3]\n"
+      "sdot z10.s, z7.b, z0.b[3]\n"
+      "sdot z14.s, z7.b, z1.b[3]\n"
+      "sdot z18.s, z7.b, z2.b[3]\n"
+      "sdot z22.s, z7.b, z3.b[3]\n"
+      "sdot z26.s, z7.b, z4.b[3]\n"
+      "sdot z30.s, z7.b, z5.b[3]\n"
+      "sdot z11.s, z6.b, z0.b[3]\n"
+      "sdot z15.s, z6.b, z1.b[3]\n"
+      "sdot z19.s, z6.b, z2.b[3]\n"
+      "sdot z23.s, z6.b, z3.b[3]\n"
+      "sdot z27.s, z6.b, z4.b[3]\n"
+      "sdot z31.s, z6.b, z5.b[3]\n"
       "65:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1748,7 +1748,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "68:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1756,4 +1755,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
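For reference, the Height 5 and Height 6 main loops above are built from SVE SDOT steps: each "sdot zAcc.s, zW.b, zX.b[idx]" adds, into every 32-bit accumulator lane, the dot product of four signed weight bytes with the four activation bytes selected by idx (the ld1rqb loads replicate a 16-byte quadword of activations across the vector, so the same four bytes feed every lane). A minimal scalar sketch of one such step, assuming that replication and using illustrative names (sdot_lane_ref, vl_s) that are not part of the kernel:

    #include <cstdint>
    #include <cstddef>

    // One "sdot zAcc.s, zW.b, zX.b[idx]" step, modelled per 32-bit lane:
    // acc[lane] += dot(4 weight bytes for that lane, 4 activation bytes picked by idx).
    static void sdot_lane_ref(int32_t *acc, const int8_t *w, const int8_t *a_quad,
                              int idx, size_t vl_s /* vector length in 32-bit lanes */)
    {
        for (size_t lane = 0; lane < vl_s; ++lane) {
            int32_t sum = 0;
            for (int k = 0; k < 4; ++k) {
                sum += static_cast<int32_t>(w[4 * lane + k]) *
                       static_cast<int32_t>(a_quad[4 * idx + k]);
            }
            acc[lane] += sum;
        }
    }

Four accumulators per output row (for example z8..z11 for the first row, z12..z15 for the second) cover four vector-widths of output columns, which is what the st1w sequences write back row by row.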
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index c089775..6862954 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, int32_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -86,7 +85,6 @@
             }
         }
 
-
         if (std::is_same<T, int8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -111,5 +109,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
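The sve_hybrid_s8s32_mmla_6x4VL kernel in the file below uses SVE SMMLA rather than SDOT: within each 128-bit segment it multiplies a 2x8 int8 block of interleaved activations (built with trn1/trn2) by the transpose of a 2x8 int8 block of packed weights, accumulating a 2x2 int32 block; the uzp1/uzp2 pairs at writeback separate the two interleaved output rows again. A minimal scalar sketch of one segment under that layout assumption, with smmla_segment_ref as an illustrative name:

    #include <cstdint>

    // One 128-bit segment of "smmla zAcc.s, zA.b, zB.b":
    // acc (2x2 int32, row-major) += a (2x8 int8, row-major) * transpose(b) (2x8 int8, row-major).
    static void smmla_segment_ref(int32_t acc[4], const int8_t a[16], const int8_t b[16])
    {
        for (int i = 0; i < 2; ++i) {         // row of the activation block
            for (int j = 0; j < 2; ++j) {     // row of the weight block (output column)
                int32_t sum = acc[2 * i + j];
                for (int k = 0; k < 8; ++k) {
                    sum += static_cast<int32_t>(a[8 * i + k]) *
                           static_cast<int32_t>(b[8 * j + k]);
                }
                acc[2 * i + j] = sum;
            }
        }
    }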
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
index 3504256..f66b634 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
@@ -100,16 +100,16 @@
       "incw x20\n"
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 3f\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 4f\n"
@@ -127,11 +127,11 @@
       "5:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 6f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 7f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -143,86 +143,86 @@
       "ble 9f\n"
       "8:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqb { z20.b }, p0/Z, [x26]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45119a88  // smmla z8.s, z20.b, z17.b\n"
+      ".inst 0x45109a8c  // smmla z12.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45119a89  // smmla z9.s, z20.b, z17.b\n"
+      ".inst 0x45109a8d  // smmla z13.s, z20.b, z16.b\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45109a8a  // smmla z10.s, z20.b, z16.b\n"
+      ".inst 0x45079a8e  // smmla z14.s, z20.b, z7.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      ".inst 0x45119a8b  // smmla z11.s, z20.b, z17.b\n"
+      ".inst 0x45109a8f  // smmla z15.s, z20.b, z16.b\n"
       "add x26, x26, #0x10\n"
       "bgt 8b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "ble 10f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45119828  // smmla z8.s, z1.b, z17.b\n"
+      ".inst 0x4510982c  // smmla z12.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45119829  // smmla z9.s, z1.b, z17.b\n"
+      ".inst 0x4510982d  // smmla z13.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x4511982a  // smmla z10.s, z1.b, z17.b\n"
+      ".inst 0x4510982e  // smmla z14.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x4511982b  // smmla z11.s, z1.b, z17.b\n"
+      ".inst 0x4510982f  // smmla z15.s, z1.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -258,21 +258,21 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 14f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x9, x20, LSL #2\n"
+      "ld1w { z18.s }, p4/Z, [x9]\n"
+      "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "zip1 z8.d, z18.d, z12.d\n"
+      "zip2 z12.d, z18.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z2.d, z13.d\n"
+      "zip2 z13.d, z2.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 15f\n"
@@ -290,12 +290,12 @@
       "16:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -303,95 +303,95 @@
       "b 18f\n"
       "17:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "18:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "ble 20f\n"
       "19:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqb { z20.b }, p0/Z, [x26]\n"
+      "ld1rqb { z19.b }, p0/Z, [x25]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45119a88  // smmla z8.s, z20.b, z17.b\n"
+      ".inst 0x45109a8c  // smmla z12.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45119a89  // smmla z9.s, z20.b, z17.b\n"
+      ".inst 0x45109a8d  // smmla z13.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45119a8a  // smmla z10.s, z20.b, z17.b\n"
+      ".inst 0x45109a8e  // smmla z14.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      ".inst 0x45119a8b  // smmla z11.s, z20.b, z17.b\n"
+      ".inst 0x45109a8f  // smmla z15.s, z20.b, z16.b\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "bgt 19b\n"
       "20:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqb { z19.b }, p0/Z, [x25]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45119a48  // smmla z8.s, z18.b, z17.b\n"
+      ".inst 0x45109a4c  // smmla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45119a49  // smmla z9.s, z18.b, z17.b\n"
+      ".inst 0x45109a4d  // smmla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45119a4a  // smmla z10.s, z18.b, z17.b\n"
+      ".inst 0x45109a4e  // smmla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x45119a4b  // smmla z11.s, z18.b, z17.b\n"
+      ".inst 0x45109a4f  // smmla z15.s, z18.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "ble 21f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45119828  // smmla z8.s, z1.b, z17.b\n"
+      ".inst 0x4510982c  // smmla z12.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45119829  // smmla z9.s, z1.b, z17.b\n"
+      ".inst 0x4510982d  // smmla z13.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x4511982a  // smmla z10.s, z1.b, z17.b\n"
+      ".inst 0x4510982e  // smmla z14.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x4511982b  // smmla z11.s, z1.b, z17.b\n"
+      ".inst 0x4510982f  // smmla z15.s, z1.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "21:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -399,24 +399,24 @@
       "cmp x28, x20\n"
       "bne 16b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "add x20, x9, x20, LSL #2\n"
+      "uzp1 z16.d, z8.d, z12.d\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
+      "uzp1 z17.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
+      "st1w { z16.s }, p4, [x9]\n"
+      "uzp1 z16.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
+      "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z2.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z8.s }, p4, [x24]\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+      "st1w { z8.s }, p4, [x20]\n"
+      "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
       "22:"  // Height 2: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -437,28 +437,28 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 25f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x20]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
@@ -490,13 +490,13 @@
       "27:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 28f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 29f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -505,169 +505,169 @@
       "b 29f\n"
       "28:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "29:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "ble 31f\n"
       "30:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "ld1rqb { z30.b }, p0/Z, [x26]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "ld1rqb { z28.b }, p0/Z, [x24]\n"
+      "trn1 z27.d, z30.d, z24.d\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "trn1 z26.d, z28.d, z29.d\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45199b68  // smmla z8.s, z27.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189b6c  // smmla z12.s, z27.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45199b69  // smmla z9.s, z27.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z29.d\n"
+      ".inst 0x45189b6d  // smmla z13.s, z27.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45199b6a  // smmla z10.s, z27.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45189b6e  // smmla z14.s, z27.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x45199b6b  // smmla z11.s, z27.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45189b6f  // smmla z15.s, z27.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x45199bc8  // smmla z8.s, z30.b, z25.b\n"
+      ".inst 0x45199b90  // smmla z16.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x45189bcc  // smmla z12.s, z30.b, z24.b\n"
+      ".inst 0x45189b94  // smmla z20.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45199bc9  // smmla z9.s, z30.b, z25.b\n"
+      ".inst 0x45199b91  // smmla z17.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x45189bcd  // smmla z13.s, z30.b, z24.b\n"
+      ".inst 0x45189b95  // smmla z21.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45199bca  // smmla z10.s, z30.b, z25.b\n"
+      ".inst 0x45199b92  // smmla z18.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x45189bce  // smmla z14.s, z30.b, z24.b\n"
+      ".inst 0x45189b96  // smmla z22.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x45199bcb  // smmla z11.s, z30.b, z25.b\n"
+      ".inst 0x45199b93  // smmla z19.s, z28.b, z25.b\n"
+      ".inst 0x45189bcf  // smmla z15.s, z30.b, z24.b\n"
+      ".inst 0x45189b97  // smmla z23.s, z28.b, z24.b\n"
       "bgt 30b\n"
       "31:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn1 z27.d, z1.d, z24.d\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "trn1 z26.d, z3.d, z28.d\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45199b68  // smmla z8.s, z27.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189b6c  // smmla z12.s, z27.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45199b69  // smmla z9.s, z27.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45189b6d  // smmla z13.s, z27.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z28.d\n"
+      ".inst 0x45199b6a  // smmla z10.s, z27.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45189b6e  // smmla z14.s, z27.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
+      ".inst 0x45199b6b  // smmla z11.s, z27.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      ".inst 0x45189b6f  // smmla z15.s, z27.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
       "ble 32f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45199828  // smmla z8.s, z1.b, z25.b\n"
+      ".inst 0x45199870  // smmla z16.s, z3.b, z25.b\n"
+      ".inst 0x4518982c  // smmla z12.s, z1.b, z24.b\n"
+      ".inst 0x45189874  // smmla z20.s, z3.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45199829  // smmla z9.s, z1.b, z25.b\n"
+      ".inst 0x45199871  // smmla z17.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x4518982d  // smmla z13.s, z1.b, z24.b\n"
+      ".inst 0x45189875  // smmla z21.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x4519982a  // smmla z10.s, z1.b, z25.b\n"
+      ".inst 0x45199872  // smmla z18.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x4518982e  // smmla z14.s, z1.b, z24.b\n"
+      ".inst 0x45189876  // smmla z22.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x4519982b  // smmla z11.s, z1.b, z25.b\n"
+      ".inst 0x45199873  // smmla z19.s, z3.b, z25.b\n"
+      ".inst 0x4518982f  // smmla z15.s, z1.b, z24.b\n"
+      ".inst 0x45189877  // smmla z23.s, z3.b, z24.b\n"
       "32:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 27b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "uzp1 z25.d, z8.d, z12.d\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "uzp1 z24.d, z9.d, z13.d\n"
+      "st1w { z25.s }, p4, [x9]\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z25.d, z10.d, z14.d\n"
+      "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+      "uzp1 z24.d, z11.d, z15.d\n"
+      "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
       "uzp2 z11.d, z11.d, z15.d\n"
       "uzp1 z16.d, z16.d, z20.d\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
       "uzp1 z17.d, z17.d, z21.d\n"
       "uzp1 z18.d, z18.d, z22.d\n"
-      "st1w { z8.s }, p4, [x24]\n"
+      "st1w { z8.s }, p4, [x21]\n"
       "uzp1 z19.d, z19.d, z23.d\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+      "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x20]\n"
+      "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
       "33:"  // Height 3: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -688,37 +688,37 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x21]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
@@ -746,14 +746,14 @@
       "38:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 39f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -763,182 +763,182 @@
       "b 40f\n"
       "39:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "40:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "ble 42f\n"
       "41:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "ld1rqb { z30.b }, p0/Z, [x26]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "trn1 z29.d, z30.d, z24.d\n"
+      "ld1rqb { z28.b }, p0/Z, [x24]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "trn1 z26.d, z28.d, z27.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45199ba8  // smmla z8.s, z29.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189bac  // smmla z12.s, z29.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45199ba9  // smmla z9.s, z29.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z27.d\n"
+      ".inst 0x45189bad  // smmla z13.s, z29.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45199baa  // smmla z10.s, z29.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45189bae  // smmla z14.s, z29.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x45199bab  // smmla z11.s, z29.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45189baf  // smmla z15.s, z29.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x45199bc8  // smmla z8.s, z30.b, z25.b\n"
+      ".inst 0x45199b90  // smmla z16.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45189bcc  // smmla z12.s, z30.b, z24.b\n"
+      ".inst 0x45189b94  // smmla z20.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x45199bc9  // smmla z9.s, z30.b, z25.b\n"
+      ".inst 0x45199b91  // smmla z17.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x45189bcd  // smmla z13.s, z30.b, z24.b\n"
+      ".inst 0x45189b95  // smmla z21.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45199bca  // smmla z10.s, z30.b, z25.b\n"
+      ".inst 0x45199b92  // smmla z18.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x45189bce  // smmla z14.s, z30.b, z24.b\n"
+      ".inst 0x45189b96  // smmla z22.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x45199bcb  // smmla z11.s, z30.b, z25.b\n"
+      ".inst 0x45199b93  // smmla z19.s, z28.b, z25.b\n"
+      ".inst 0x45189bcf  // smmla z15.s, z30.b, z24.b\n"
+      ".inst 0x45189b97  // smmla z23.s, z28.b, z24.b\n"
       "bgt 41b\n"
       "42:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "trn1 z28.d, z1.d, z24.d\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "trn1 z26.d, z3.d, z27.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45199b88  // smmla z8.s, z28.b, z25.b\n"
+      ".inst 0x45199b50  // smmla z16.s, z26.b, z25.b\n"
+      ".inst 0x45189b8c  // smmla z12.s, z28.b, z24.b\n"
+      ".inst 0x45189b54  // smmla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45199b89  // smmla z9.s, z28.b, z25.b\n"
+      ".inst 0x45199b51  // smmla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45189b8d  // smmla z13.s, z28.b, z24.b\n"
+      ".inst 0x45189b55  // smmla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z27.d\n"
+      ".inst 0x45199b8a  // smmla z10.s, z28.b, z25.b\n"
+      ".inst 0x45199b52  // smmla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45189b8e  // smmla z14.s, z28.b, z24.b\n"
+      ".inst 0x45189b56  // smmla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
+      ".inst 0x45199b8b  // smmla z11.s, z28.b, z25.b\n"
+      ".inst 0x45199b53  // smmla z19.s, z26.b, z25.b\n"
+      ".inst 0x45189b8f  // smmla z15.s, z28.b, z24.b\n"
+      ".inst 0x45189b57  // smmla z23.s, z26.b, z24.b\n"
       "ble 43f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45199828  // smmla z8.s, z1.b, z25.b\n"
+      ".inst 0x45199870  // smmla z16.s, z3.b, z25.b\n"
+      ".inst 0x4518982c  // smmla z12.s, z1.b, z24.b\n"
+      ".inst 0x45189874  // smmla z20.s, z3.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45199829  // smmla z9.s, z1.b, z25.b\n"
+      ".inst 0x45199871  // smmla z17.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x4518982d  // smmla z13.s, z1.b, z24.b\n"
+      ".inst 0x45189875  // smmla z21.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x4519982a  // smmla z10.s, z1.b, z25.b\n"
+      ".inst 0x45199872  // smmla z18.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x4518982e  // smmla z14.s, z1.b, z24.b\n"
+      ".inst 0x45189876  // smmla z22.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
+      ".inst 0x4519982b  // smmla z11.s, z1.b, z25.b\n"
+      ".inst 0x45199873  // smmla z19.s, z3.b, z25.b\n"
+      ".inst 0x4518982f  // smmla z15.s, z1.b, z24.b\n"
+      ".inst 0x45189877  // smmla z23.s, z3.b, z24.b\n"
       "43:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 38b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "uzp1 z25.d, z8.d, z12.d\n"
+      "add x20, x21, x20, LSL #2\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "uzp1 z24.d, z9.d, z13.d\n"
+      "st1w { z25.s }, p4, [x9]\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z25.d, z10.d, z14.d\n"
+      "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+      "uzp1 z24.d, z11.d, z15.d\n"
+      "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "uzp1 z15.d, z16.d, z20.d\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "uzp1 z25.d, z16.d, z20.d\n"
+      "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "uzp1 z20.d, z17.d, z21.d\n"
-      "st1w { z8.s }, p4, [x24]\n"
+      "uzp1 z24.d, z17.d, z21.d\n"
+      "st1w { z8.s }, p4, [x22]\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+      "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
       "uzp2 z18.d, z18.d, z22.d\n"
-      "uzp1 z22.d, z19.d, z23.d\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+      "uzp1 z20.d, z19.d, z23.d\n"
+      "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
       "uzp2 z19.d, z19.d, z23.d\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z15.s }, p4, [x23]\n"
-      "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x22]\n"
-      "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z25.s }, p4, [x21]\n"
+      "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x20]\n"
+      "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
       "44:"  // Height 4: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -959,54 +959,54 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x22]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x21]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x20]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z19.d, z24.d, z23.d\n"
       "zip2 z23.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z24.d, z25.d, z28.d\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 48f\n"
       "47:"  // Height 5: no accumulate
       "mov z8.s, #0x0\n"
@@ -1038,15 +1038,15 @@
       "49:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1057,231 +1057,231 @@
       "b 51f\n"
       "50:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "51:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "ble 53f\n"
       "52:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqb { z6.b }, p0/Z, [x26]\n"
+      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z7.b }, p0/Z, [x24]\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn1 z5.d, z6.d, z1.d\n"
+      "trn2 z6.d, z6.d, z1.d\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "trn1 z3.d, z7.d, z2.d\n"
+      "trn2 z7.d, z7.d, z2.d\n"
+      "ld1b { z1.b }, p5/Z, [x10]\n"
+      "trn1 z2.d, z4.d, z0.d\n"
+      "trn2 z4.d, z4.d, z0.d\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x450198a8  // smmla z8.s, z5.b, z1.b\n"
+      ".inst 0x45019870  // smmla z16.s, z3.b, z1.b\n"
+      ".inst 0x45019858  // smmla z24.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
+      ".inst 0x450098ac  // smmla z12.s, z5.b, z0.b\n"
+      ".inst 0x45009874  // smmla z20.s, z3.b, z0.b\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
+      ".inst 0x4500985c  // smmla z28.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x450198a9  // smmla z9.s, z5.b, z1.b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45019871  // smmla z17.s, z3.b, z1.b\n"
+      ".inst 0x45019859  // smmla z25.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
+      ".inst 0x450098ad  // smmla z13.s, z5.b, z0.b\n"
+      ".inst 0x45009875  // smmla z21.s, z3.b, z0.b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x4500985d  // smmla z29.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x450198aa  // smmla z10.s, z5.b, z1.b\n"
+      ".inst 0x45019872  // smmla z18.s, z3.b, z1.b\n"
+      ".inst 0x4501985a  // smmla z26.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x450098ae  // smmla z14.s, z5.b, z0.b\n"
+      ".inst 0x45009876  // smmla z22.s, z3.b, z0.b\n"
+      ".inst 0x4500985e  // smmla z30.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x450198ab  // smmla z11.s, z5.b, z1.b\n"
+      ".inst 0x45019873  // smmla z19.s, z3.b, z1.b\n"
+      ".inst 0x4501985b  // smmla z27.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x450098af  // smmla z15.s, z5.b, z0.b\n"
+      ".inst 0x45009877  // smmla z23.s, z3.b, z0.b\n"
+      ".inst 0x4500985f  // smmla z31.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x450198c8  // smmla z8.s, z6.b, z1.b\n"
+      ".inst 0x450198f0  // smmla z16.s, z7.b, z1.b\n"
+      ".inst 0x45019898  // smmla z24.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x450098cc  // smmla z12.s, z6.b, z0.b\n"
+      ".inst 0x450098f4  // smmla z20.s, z7.b, z0.b\n"
+      ".inst 0x4500989c  // smmla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x450198c9  // smmla z9.s, z6.b, z1.b\n"
+      ".inst 0x450198f1  // smmla z17.s, z7.b, z1.b\n"
+      ".inst 0x45019899  // smmla z25.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x450098cd  // smmla z13.s, z6.b, z0.b\n"
+      ".inst 0x450098f5  // smmla z21.s, z7.b, z0.b\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x450198ca  // smmla z10.s, z6.b, z1.b\n"
+      ".inst 0x450198f2  // smmla z18.s, z7.b, z1.b\n"
+      ".inst 0x4501989a  // smmla z26.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x450098ce  // smmla z14.s, z6.b, z0.b\n"
+      ".inst 0x450098f6  // smmla z22.s, z7.b, z0.b\n"
+      ".inst 0x4500989e  // smmla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x450198cb  // smmla z11.s, z6.b, z1.b\n"
+      ".inst 0x450198f3  // smmla z19.s, z7.b, z1.b\n"
+      ".inst 0x4501989b  // smmla z27.s, z4.b, z1.b\n"
+      ".inst 0x450098cf  // smmla z15.s, z6.b, z0.b\n"
+      ".inst 0x450098f7  // smmla z23.s, z7.b, z0.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "bgt 52b\n"
       "53:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
+      "ld1rqb { z4.b }, p0/Z, [x25]\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn1 z7.d, z1.d, z4.d\n"
+      "trn2 z1.d, z1.d, z4.d\n"
       "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "trn1 z6.d, z3.d, z2.d\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x450298e8  // smmla z8.s, z7.b, z2.b\n"
+      ".inst 0x450298d0  // smmla z16.s, z6.b, z2.b\n"
+      ".inst 0x45029898  // smmla z24.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
+      ".inst 0x450098ec  // smmla z12.s, z7.b, z0.b\n"
+      ".inst 0x450098d4  // smmla z20.s, z6.b, z0.b\n"
+      ".inst 0x4500989c  // smmla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x450298e9  // smmla z9.s, z7.b, z2.b\n"
+      ".inst 0x450298d1  // smmla z17.s, z6.b, z2.b\n"
+      ".inst 0x45029899  // smmla z25.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450098d5  // smmla z21.s, z6.b, z0.b\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x450298ea  // smmla z10.s, z7.b, z2.b\n"
+      ".inst 0x450298d2  // smmla z18.s, z6.b, z2.b\n"
+      ".inst 0x4502989a  // smmla z26.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x450098ee  // smmla z14.s, z7.b, z0.b\n"
+      ".inst 0x450098d6  // smmla z22.s, z6.b, z0.b\n"
+      ".inst 0x4500989e  // smmla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x450298eb  // smmla z11.s, z7.b, z2.b\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
+      ".inst 0x450298d3  // smmla z19.s, z6.b, z2.b\n"
+      ".inst 0x4502989b  // smmla z27.s, z4.b, z2.b\n"
+      ".inst 0x450098ef  // smmla z15.s, z7.b, z0.b\n"
+      ".inst 0x450098d7  // smmla z23.s, z6.b, z0.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "ble 54f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45029828  // smmla z8.s, z1.b, z2.b\n"
+      ".inst 0x45029870  // smmla z16.s, z3.b, z2.b\n"
+      ".inst 0x450298b8  // smmla z24.s, z5.b, z2.b\n"
+      ".inst 0x4500982c  // smmla z12.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x45009874  // smmla z20.s, z3.b, z0.b\n"
+      ".inst 0x450098bc  // smmla z28.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45029829  // smmla z9.s, z1.b, z2.b\n"
+      ".inst 0x45029871  // smmla z17.s, z3.b, z2.b\n"
+      ".inst 0x450298b9  // smmla z25.s, z5.b, z2.b\n"
+      ".inst 0x4500982d  // smmla z13.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45009875  // smmla z21.s, z3.b, z0.b\n"
+      ".inst 0x450098bd  // smmla z29.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x4502982a  // smmla z10.s, z1.b, z2.b\n"
+      ".inst 0x45029872  // smmla z18.s, z3.b, z2.b\n"
+      ".inst 0x450298ba  // smmla z26.s, z5.b, z2.b\n"
+      ".inst 0x4500982e  // smmla z14.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45009876  // smmla z22.s, z3.b, z0.b\n"
+      ".inst 0x450098be  // smmla z30.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x4502982b  // smmla z11.s, z1.b, z2.b\n"
+      ".inst 0x45029873  // smmla z19.s, z3.b, z2.b\n"
+      ".inst 0x450298bb  // smmla z27.s, z5.b, z2.b\n"
+      ".inst 0x4500982f  // smmla z15.s, z1.b, z0.b\n"
+      ".inst 0x45009877  // smmla z23.s, z3.b, z0.b\n"
+      ".inst 0x450098bf  // smmla z31.s, z5.b, z0.b\n"
       "54:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 49b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "uzp1 z2.d, z8.d, z12.d\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
+      "uzp1 z1.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "uzp1 z0.d, z10.d, z14.d\n"
+      "st1w { z2.s }, p4, [x9]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z2.d, z11.d, z15.d\n"
+      "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "uzp1 z15.d, z16.d, z20.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+      "uzp1 z1.d, z16.d, z20.d\n"
+      "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "uzp1 z20.d, z17.d, z21.d\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "uzp1 z0.d, z17.d, z21.d\n"
+      "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
-      "st1w { z8.s }, p4, [x24]\n"
+      "st1w { z8.s }, p4, [x23]\n"
       "uzp2 z18.d, z18.d, z22.d\n"
-      "uzp1 z22.d, z19.d, z23.d\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+      "uzp1 z20.d, z19.d, z23.d\n"
+      "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
       "uzp2 z19.d, z19.d, z23.d\n"
       "uzp1 z24.d, z24.d, z28.d\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+      "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
       "uzp1 z25.d, z25.d, z29.d\n"
       "uzp1 z26.d, z26.d, z30.d\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+      "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
       "uzp1 z27.d, z27.d, z31.d\n"
-      "st1w { z15.s }, p4, [x23]\n"
-      "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x22]\n"
-      "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
-      "st1w { z24.s }, p4, [x21]\n"
-      "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
-      "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
-      "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z1.s }, p4, [x22]\n"
+      "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
+      "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+      "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x21]\n"
+      "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z24.s }, p4, [x20]\n"
+      "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
       "55:"  // Height 5: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -1307,26 +1307,26 @@
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
       "add x20, x21, x20, LSL #2\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
       "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
+      "zip1 z8.d, z17.d, z12.d\n"
       "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
       "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "zip1 z9.d, z10.d, z13.d\n"
+      "zip2 z12.d, z17.d, z12.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
       "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
       "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z20.d, z14.d\n"
       "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
       "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip2 z14.d, z20.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
       "ld1w { z20.s }, p4/Z, [x22]\n"
@@ -1344,7 +1344,7 @@
       "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
       "ld1w { z28.s }, p4/Z, [x20]\n"
       "zip2 z23.d, z24.d, z23.d\n"
       "zip1 z24.d, z25.d, z28.d\n"
@@ -1356,8 +1356,8 @@
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 59f\n"
       "58:"  // Height 6: no accumulate
       "mov z8.s, #0x0\n"
@@ -1389,16 +1389,16 @@
       "60:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 61f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 62f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1410,184 +1410,184 @@
       "b 62f\n"
       "61:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "62:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "ble 64f\n"
       "63:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "ld1rqb { z6.b }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqb { z7.b }, p0/Z, [x26]\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
+      "trn1 z6.d, z7.d, z0.d\n"
+      "ld1rqb { z5.b }, p0/Z, [x24]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "trn2 z7.d, z7.d, z0.d\n"
+      "trn1 z4.d, z5.d, z1.d\n"
+      "ld1rqb { z3.b }, p0/Z, [x22]\n"
+      "ld1rqb { z0.b }, p0/Z, [x21]\n"
+      "trn2 z5.d, z5.d, z1.d\n"
+      "trn1 z2.d, z3.d, z0.d\n"
+      "trn2 z3.d, z3.d, z0.d\n"
+      "ld1b { z1.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x450198c8  // smmla z8.s, z6.b, z1.b\n"
+      ".inst 0x45019890  // smmla z16.s, z4.b, z1.b\n"
+      ".inst 0x45019858  // smmla z24.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
+      ".inst 0x450098cc  // smmla z12.s, z6.b, z0.b\n"
+      ".inst 0x45009894  // smmla z20.s, z4.b, z0.b\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
+      ".inst 0x4500985c  // smmla z28.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x450198c9  // smmla z9.s, z6.b, z1.b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45019891  // smmla z17.s, z4.b, z1.b\n"
+      ".inst 0x45019859  // smmla z25.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
+      ".inst 0x450098cd  // smmla z13.s, z6.b, z0.b\n"
+      ".inst 0x45009895  // smmla z21.s, z4.b, z0.b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
+      ".inst 0x4500985d  // smmla z29.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x450198ca  // smmla z10.s, z6.b, z1.b\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45019892  // smmla z18.s, z4.b, z1.b\n"
+      ".inst 0x4501985a  // smmla z26.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x450098ce  // smmla z14.s, z6.b, z0.b\n"
+      ".inst 0x45009896  // smmla z22.s, z4.b, z0.b\n"
+      ".inst 0x4500985e  // smmla z30.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x450198cb  // smmla z11.s, z6.b, z1.b\n"
+      ".inst 0x45019893  // smmla z19.s, z4.b, z1.b\n"
+      ".inst 0x4501985b  // smmla z27.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x450098cf  // smmla z15.s, z6.b, z0.b\n"
+      ".inst 0x45009897  // smmla z23.s, z4.b, z0.b\n"
+      ".inst 0x4500985f  // smmla z31.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x450198e8  // smmla z8.s, z7.b, z1.b\n"
+      ".inst 0x450198b0  // smmla z16.s, z5.b, z1.b\n"
+      ".inst 0x45019878  // smmla z24.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x450098ec  // smmla z12.s, z7.b, z0.b\n"
+      ".inst 0x450098b4  // smmla z20.s, z5.b, z0.b\n"
+      ".inst 0x4500987c  // smmla z28.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x450198e9  // smmla z9.s, z7.b, z1.b\n"
+      ".inst 0x450198b1  // smmla z17.s, z5.b, z1.b\n"
+      ".inst 0x45019879  // smmla z25.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450098b5  // smmla z21.s, z5.b, z0.b\n"
+      ".inst 0x4500987d  // smmla z29.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x450198ea  // smmla z10.s, z7.b, z1.b\n"
+      ".inst 0x450198b2  // smmla z18.s, z5.b, z1.b\n"
+      ".inst 0x4501987a  // smmla z26.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x450098ee  // smmla z14.s, z7.b, z0.b\n"
+      ".inst 0x450098b6  // smmla z22.s, z5.b, z0.b\n"
+      ".inst 0x4500987e  // smmla z30.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x450198eb  // smmla z11.s, z7.b, z1.b\n"
+      ".inst 0x450198b3  // smmla z19.s, z5.b, z1.b\n"
+      ".inst 0x4501987b  // smmla z27.s, z3.b, z1.b\n"
+      ".inst 0x450098ef  // smmla z15.s, z7.b, z0.b\n"
+      ".inst 0x450098b7  // smmla z23.s, z5.b, z0.b\n"
+      ".inst 0x4500987f  // smmla z31.s, z3.b, z0.b\n"
       "bgt 63b\n"
       "64:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
+      "trn1 z7.d, z1.d, z0.d\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z0.d\n"
+      "trn1 z6.d, z3.d, z2.d\n"
       "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "ld1rqb { z6.b }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079808  // smmla z8.s, z0.b, z7.b\n"
-      ".inst 0x45079850  // smmla z16.s, z2.b, z7.b\n"
-      ".inst 0x45079898  // smmla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x21]\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x450298e8  // smmla z8.s, z7.b, z2.b\n"
+      ".inst 0x450298d0  // smmla z16.s, z6.b, z2.b\n"
+      ".inst 0x45029898  // smmla z24.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x4506980c  // smmla z12.s, z0.b, z6.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      ".inst 0x4506989c  // smmla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
-      ".inst 0x45079851  // smmla z17.s, z2.b, z7.b\n"
-      ".inst 0x45079899  // smmla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x4506980d  // smmla z13.s, z0.b, z6.b\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      ".inst 0x4506989d  // smmla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507980a  // smmla z10.s, z0.b, z7.b\n"
-      ".inst 0x45079852  // smmla z18.s, z2.b, z7.b\n"
-      ".inst 0x4507989a  // smmla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x4506980e  // smmla z14.s, z0.b, z6.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x4506989e  // smmla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
+      ".inst 0x450098ec  // smmla z12.s, z7.b, z0.b\n"
+      ".inst 0x450098d4  // smmla z20.s, z6.b, z0.b\n"
+      ".inst 0x4500989c  // smmla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x450298e9  // smmla z9.s, z7.b, z2.b\n"
+      ".inst 0x450298d1  // smmla z17.s, z6.b, z2.b\n"
+      ".inst 0x45029899  // smmla z25.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450098d5  // smmla z21.s, z6.b, z0.b\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x450298ea  // smmla z10.s, z7.b, z2.b\n"
+      ".inst 0x450298d2  // smmla z18.s, z6.b, z2.b\n"
+      ".inst 0x4502989a  // smmla z26.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x450098ee  // smmla z14.s, z7.b, z0.b\n"
+      ".inst 0x450098d6  // smmla z22.s, z6.b, z0.b\n"
+      ".inst 0x4500989e  // smmla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x450298eb  // smmla z11.s, z7.b, z2.b\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45079853  // smmla z19.s, z2.b, z7.b\n"
-      ".inst 0x4507989b  // smmla z27.s, z4.b, z7.b\n"
-      ".inst 0x4506980f  // smmla z15.s, z0.b, z6.b\n"
-      ".inst 0x45069857  // smmla z23.s, z2.b, z6.b\n"
-      ".inst 0x4506989f  // smmla z31.s, z4.b, z6.b\n"
+      ".inst 0x450298d3  // smmla z19.s, z6.b, z2.b\n"
+      ".inst 0x4502989b  // smmla z27.s, z4.b, z2.b\n"
+      ".inst 0x450098ef  // smmla z15.s, z7.b, z0.b\n"
+      ".inst 0x450098d7  // smmla z23.s, z6.b, z0.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "ble 65f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45079828  // smmla z8.s, z1.b, z7.b\n"
-      ".inst 0x45079870  // smmla z16.s, z3.b, z7.b\n"
-      ".inst 0x450798b8  // smmla z24.s, z5.b, z7.b\n"
-      ".inst 0x4506982c  // smmla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x45069874  // smmla z20.s, z3.b, z6.b\n"
-      ".inst 0x450698bc  // smmla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45079829  // smmla z9.s, z1.b, z7.b\n"
-      ".inst 0x45079871  // smmla z17.s, z3.b, z7.b\n"
-      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
-      ".inst 0x4506982d  // smmla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45069875  // smmla z21.s, z3.b, z6.b\n"
-      ".inst 0x450698bd  // smmla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x4507982a  // smmla z10.s, z1.b, z7.b\n"
-      ".inst 0x45079872  // smmla z18.s, z3.b, z7.b\n"
-      ".inst 0x450798ba  // smmla z26.s, z5.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45069876  // smmla z22.s, z3.b, z6.b\n"
-      ".inst 0x450698be  // smmla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45029828  // smmla z8.s, z1.b, z2.b\n"
+      ".inst 0x45029870  // smmla z16.s, z3.b, z2.b\n"
+      ".inst 0x450298b8  // smmla z24.s, z5.b, z2.b\n"
+      ".inst 0x4500982c  // smmla z12.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x45009874  // smmla z20.s, z3.b, z0.b\n"
+      ".inst 0x450098bc  // smmla z28.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45029829  // smmla z9.s, z1.b, z2.b\n"
+      ".inst 0x45029871  // smmla z17.s, z3.b, z2.b\n"
+      ".inst 0x450298b9  // smmla z25.s, z5.b, z2.b\n"
+      ".inst 0x4500982d  // smmla z13.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45009875  // smmla z21.s, z3.b, z0.b\n"
+      ".inst 0x450098bd  // smmla z29.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x4502982a  // smmla z10.s, z1.b, z2.b\n"
+      ".inst 0x45029872  // smmla z18.s, z3.b, z2.b\n"
+      ".inst 0x450298ba  // smmla z26.s, z5.b, z2.b\n"
+      ".inst 0x4500982e  // smmla z14.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45009876  // smmla z22.s, z3.b, z0.b\n"
+      ".inst 0x450098be  // smmla z30.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x4507982b  // smmla z11.s, z1.b, z7.b\n"
-      ".inst 0x45079873  // smmla z19.s, z3.b, z7.b\n"
-      ".inst 0x450798bb  // smmla z27.s, z5.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45069877  // smmla z23.s, z3.b, z6.b\n"
-      ".inst 0x450698bf  // smmla z31.s, z5.b, z6.b\n"
+      ".inst 0x4502982b  // smmla z11.s, z1.b, z2.b\n"
+      ".inst 0x45029873  // smmla z19.s, z3.b, z2.b\n"
+      ".inst 0x450298bb  // smmla z27.s, z5.b, z2.b\n"
+      ".inst 0x4500982f  // smmla z15.s, z1.b, z0.b\n"
+      ".inst 0x45009877  // smmla z23.s, z3.b, z0.b\n"
+      ".inst 0x450098bf  // smmla z31.s, z5.b, z0.b\n"
       "65:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1596,7 +1596,7 @@
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "uzp1 z0.d, z8.d, z12.d\n"
       "add x22, x23, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
       "uzp2 z8.d, z8.d, z12.d\n"
@@ -1604,7 +1604,7 @@
       "add x20, x21, x20, LSL #2\n"
       "uzp2 z9.d, z9.d, z13.d\n"
       "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "st1w { z0.s }, p4, [x9]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1664,7 +1664,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "68:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1672,4 +1671,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index c66ebed..11fe5ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, uint8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 79bd563..e74b424 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -104,11 +104,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -121,39 +121,39 @@
       "7:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28]\n"
+      "udot z16.s, z20.b, z0.b[0]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "udot z17.s, z21.b, z0.b[0]\n"
+      "udot z18.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z19.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "udot z16.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "udot z17.s, z21.b, z0.b[1]\n"
+      "udot z18.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
+      "udot z19.s, z20.b, z0.b[1]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "udot z16.s, z22.b, z0.b[2]\n"
+      "udot z17.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "udot z18.s, z21.b, z0.b[2]\n"
+      "udot z19.s, z20.b, z0.b[2]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "udot z16.s, z22.b, z0.b[3]\n"
+      "udot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "udot z18.s, z21.b, z0.b[3]\n"
+      "udot z19.s, z20.b, z0.b[3]\n"
       "add x24, x24, #0x10\n"
       "tbnz %x[flags], #31, 8f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -164,47 +164,47 @@
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "ld1b { z22.b }, p2/Z, [x28]\n"
       "subs x25, x25, #0x4\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "udot z16.s, z22.b, z0.b[0]\n"
+      "udot z17.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z18.s, z21.b, z0.b[0]\n"
+      "udot z19.s, z20.b, z0.b[0]\n"
       "addvl x28, x28, #4\n"
       "ble 10f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
+      "udot z16.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z17.s, z22.b, z0.b[1]\n"
+      "udot z18.s, z21.b, z0.b[1]\n"
+      "udot z19.s, z20.b, z0.b[1]\n"
       "addvl x28, x28, #4\n"
       "ble 10f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28]\n"
+      "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
+      "udot z16.s, z20.b, z0.b[2]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z17.s, z22.b, z0.b[2]\n"
+      "udot z18.s, z21.b, z0.b[2]\n"
+      "udot z19.s, z20.b, z0.b[2]\n"
       "addvl x28, x28, #4\n"
       "ble 10f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
+      "ld1b { z21.b }, p2/Z, [x28]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z21.b, z0.b[3]\n"
+      "udot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z18.s, z21.b, z0.b[3]\n"
+      "udot z19.s, z20.b, z0.b[3]\n"
       "addvl x28, x28, #4\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 11f\n"
@@ -218,71 +218,71 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
       "uaddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
-      "neg z1.s, p2/M, z1.s\n"
-      "mul z11.s, p2/M, z11.s, z1.s\n"
+      "neg z20.s, p2/M, z20.s\n"
+      "mul z11.s, p2/M, z11.s, z20.s\n"
       "12:"  // Height 1: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z23.s }, p2/Z, [x10]\n"
+      "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "add z16.s, z16.s, z23.s\n"
+      "add z17.s, z17.s, z22.s\n"
+      "add z18.s, z18.s, z21.s\n"
+      "add z19.s, z19.s, z20.s\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04b47610  // sqrdmulh z16.s, z16.s, z20.s\n"
+      ".inst 0x04b47631  // sqrdmulh z17.s, z17.s, z20.s\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04b47652  // sqrdmulh z18.s, z18.s, z20.s\n"
+      ".inst 0x04b47673  // sqrdmulh z19.s, z19.s, z20.s\n"
       "tbz %x[flags], #5, 13f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z23.d, z16.d, z0.d\n"
+      "and z22.d, z17.d, z0.d\n"
+      "and z21.d, z18.d, z0.d\n"
+      "and z20.d, z19.d, z0.d\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z23.s\n"
+      "sqadd z17.s, z17.s, z22.s\n"
+      "sqadd z18.s, z18.s, z21.s\n"
+      "sqadd z19.s, z19.s, z20.s\n"
       "13:"  // Height 1: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z20.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z20.s\n"
+      "add z18.s, z18.s, z20.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z19.s, z19.s, z4.s\n"
+      "ld1rw { z21.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z20.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z21.s\n"
+      "smin z17.s, p2/M, z17.s, z21.s\n"
+      "smin z18.s, p2/M, z18.s, z21.s\n"
+      "smin z19.s, p2/M, z19.s, z21.s\n"
+      "smax z16.s, p2/M, z16.s, z20.s\n"
+      "smax z17.s, p2/M, z17.s, z20.s\n"
+      "smax z18.s, p2/M, z18.s, z20.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z20.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
       "st1b { z16.b }, p1, [x27]\n"
@@ -317,12 +317,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -330,7 +330,7 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "20:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "ble 23f\n"
@@ -339,56 +339,56 @@
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x24, x24, #0x10\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "udot z21.s, z5.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z22.s, z6.b, z1.b[0]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "udot z23.s, z7.b, z1.b[0]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z24.b, z0.b[0]\n"
+      "udot z20.s, z24.b, z1.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z17.s, z26.b, z0.b[0]\n"
+      "udot z21.s, z26.b, z1.b[0]\n"
+      "udot z18.s, z24.b, z0.b[0]\n"
+      "udot z22.s, z24.b, z1.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "udot z19.s, z25.b, z0.b[0]\n"
+      "udot z23.s, z25.b, z1.b[0]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "udot z20.s, z8.b, z1.b[1]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "udot z21.s, z9.b, z1.b[1]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "udot z22.s, z10.b, z1.b[1]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
-      "udot z23.s, z4.b, z1.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "udot z20.s, z5.b, z1.b[2]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "udot z16.s, z24.b, z0.b[1]\n"
+      "udot z20.s, z24.b, z1.b[1]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "udot z17.s, z27.b, z0.b[1]\n"
+      "udot z21.s, z27.b, z1.b[1]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "udot z18.s, z26.b, z0.b[1]\n"
+      "udot z22.s, z26.b, z1.b[1]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "udot z19.s, z25.b, z0.b[1]\n"
+      "udot z23.s, z25.b, z1.b[1]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "udot z16.s, z24.b, z0.b[2]\n"
+      "udot z20.s, z24.b, z1.b[2]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "udot z21.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z22.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
-      "udot z23.s, z8.b, z1.b[2]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z20.s, z9.b, z1.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "udot z21.s, z10.b, z1.b[3]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z22.s, z4.b, z1.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z23.s, z5.b, z1.b[3]\n"
+      "udot z17.s, z30.b, z0.b[2]\n"
+      "udot z21.s, z30.b, z1.b[2]\n"
+      "udot z18.s, z29.b, z0.b[2]\n"
+      "udot z22.s, z29.b, z1.b[2]\n"
+      "udot z19.s, z28.b, z0.b[2]\n"
+      "udot z23.s, z28.b, z1.b[2]\n"
+      "udot z16.s, z27.b, z0.b[3]\n"
+      "udot z20.s, z27.b, z1.b[3]\n"
+      "udot z17.s, z26.b, z0.b[3]\n"
+      "udot z21.s, z26.b, z1.b[3]\n"
+      "udot z18.s, z25.b, z0.b[3]\n"
+      "udot z22.s, z25.b, z1.b[3]\n"
+      "udot z19.s, z24.b, z0.b[3]\n"
+      "udot z23.s, z24.b, z1.b[3]\n"
       "tbnz %x[flags], #31, 22f\n"
       "udot z11.s, z0.b, z15.b\n"
       "udot z12.s, z1.b, z15.b\n"
@@ -401,63 +401,63 @@
       "ld1rqb { z0.b }, p0/Z, [x24]\n"
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "subs x25, x25, #0x4\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "udot z21.s, z5.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z24.b, z0.b[0]\n"
+      "udot z20.s, z24.b, z1.b[0]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z17.s, z26.b, z0.b[0]\n"
+      "udot z21.s, z26.b, z1.b[0]\n"
+      "udot z18.s, z25.b, z0.b[0]\n"
+      "udot z22.s, z25.b, z1.b[0]\n"
       "addvl x28, x28, #4\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "udot z23.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z24.b, z0.b[0]\n"
+      "udot z23.s, z24.b, z1.b[0]\n"
       "ble 24f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z20.s, z8.b, z1.b[1]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "udot z21.s, z9.b, z1.b[1]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
+      "udot z16.s, z27.b, z0.b[1]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z20.s, z27.b, z1.b[1]\n"
+      "udot z17.s, z26.b, z0.b[1]\n"
+      "udot z21.s, z26.b, z1.b[1]\n"
+      "udot z18.s, z25.b, z0.b[1]\n"
       "addvl x28, x28, #4\n"
-      "udot z22.s, z10.b, z1.b[1]\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
-      "udot z23.s, z4.b, z1.b[1]\n"
+      "udot z22.s, z25.b, z1.b[1]\n"
+      "udot z19.s, z24.b, z0.b[1]\n"
+      "udot z23.s, z24.b, z1.b[1]\n"
       "ble 24f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z27.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z20.s, z5.b, z1.b[2]\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "udot z21.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
+      "udot z16.s, z27.b, z0.b[2]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z20.s, z27.b, z1.b[2]\n"
+      "udot z17.s, z26.b, z0.b[2]\n"
+      "udot z21.s, z26.b, z1.b[2]\n"
+      "udot z18.s, z25.b, z0.b[2]\n"
       "addvl x28, x28, #4\n"
-      "udot z22.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
-      "udot z23.s, z8.b, z1.b[2]\n"
+      "udot z22.s, z25.b, z1.b[2]\n"
+      "udot z19.s, z24.b, z0.b[2]\n"
+      "udot z23.s, z24.b, z1.b[2]\n"
       "ble 24f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z20.s, z9.b, z1.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "udot z21.s, z10.b, z1.b[3]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z22.s, z4.b, z1.b[3]\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z24.b, z0.b[3]\n"
+      "udot z20.s, z24.b, z1.b[3]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z17.s, z26.b, z0.b[3]\n"
+      "udot z21.s, z26.b, z1.b[3]\n"
+      "udot z18.s, z25.b, z0.b[3]\n"
+      "udot z22.s, z25.b, z1.b[3]\n"
       "addvl x28, x28, #4\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z23.s, z5.b, z1.b[3]\n"
+      "udot z19.s, z24.b, z0.b[3]\n"
+      "udot z23.s, z24.b, z1.b[3]\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 25f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -473,120 +473,120 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z2.s }, p2/Z, [x20]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       "uaddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
       "uaddv d12, p0, z12.s\n"
-      "neg z2.s, p2/M, z2.s\n"
+      "neg z24.s, p2/M, z24.s\n"
       "mov z12.s, z12.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z2.s\n"
-      "mul z12.s, p2/M, z12.s, z2.s\n"
+      "mul z11.s, p2/M, z11.s, z24.s\n"
+      "mul z12.s, p2/M, z12.s, z24.s\n"
       "26:"  // Height 2: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x10]\n"
+      "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z20.s, z20.s, z12.s\n"
       "add z21.s, z21.s, z12.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z22.s, z22.s, z12.s\n"
       "add z23.s, z23.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
+      "add z16.s, z16.s, z28.s\n"
+      "add z17.s, z17.s, z27.s\n"
       "addvl x10, x10, #4\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
+      "add z18.s, z18.s, z26.s\n"
+      "add z19.s, z19.s, z25.s\n"
+      "add z20.s, z20.s, z28.s\n"
+      "add z21.s, z21.s, z27.s\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      "add z22.s, z22.s, z26.s\n"
+      "add z23.s, z23.s, z25.s\n"
+      ".inst 0x04b87610  // sqrdmulh z16.s, z16.s, z24.s\n"
+      ".inst 0x04b87631  // sqrdmulh z17.s, z17.s, z24.s\n"
+      ".inst 0x04b87652  // sqrdmulh z18.s, z18.s, z24.s\n"
+      ".inst 0x04b87673  // sqrdmulh z19.s, z19.s, z24.s\n"
+      ".inst 0x04b87694  // sqrdmulh z20.s, z20.s, z24.s\n"
+      ".inst 0x04b876b5  // sqrdmulh z21.s, z21.s, z24.s\n"
+      ".inst 0x04b876d6  // sqrdmulh z22.s, z22.s, z24.s\n"
+      ".inst 0x04b876f7  // sqrdmulh z23.s, z23.s, z24.s\n"
       "tbz %x[flags], #5, 27f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "and z8.d, z20.d, z0.d\n"
-      "and z9.d, z21.d, z0.d\n"
-      "and z10.d, z22.d, z0.d\n"
-      "and z4.d, z23.d, z0.d\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "sqadd z20.s, z20.s, z8.s\n"
-      "sqadd z21.s, z21.s, z9.s\n"
-      "sqadd z22.s, z22.s, z10.s\n"
-      "sqadd z23.s, z23.s, z4.s\n"
+      "and z24.d, z16.d, z0.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z24.s\n"
+      "and z30.d, z17.d, z0.d\n"
+      "and z29.d, z18.d, z0.d\n"
+      "and z28.d, z19.d, z0.d\n"
+      "and z27.d, z20.d, z0.d\n"
+      "and z26.d, z21.d, z0.d\n"
+      "and z25.d, z22.d, z0.d\n"
+      "and z24.d, z23.d, z0.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z30.s\n"
+      "sqadd z18.s, z18.s, z29.s\n"
+      "sqadd z19.s, z19.s, z28.s\n"
+      "sqadd z20.s, z20.s, z27.s\n"
+      "sqadd z21.s, z21.s, z26.s\n"
+      "sqadd z22.s, z22.s, z25.s\n"
+      "sqadd z23.s, z23.s, z24.s\n"
       "27:"  // Height 2: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z24.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z24.s\n"
+      "add z18.s, z18.s, z24.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z24.s\n"
+      "add z20.s, z20.s, z24.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z24.s\n"
+      "add z22.s, z22.s, z24.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z23.s, z23.s, z4.s\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
+      "add z23.s, z23.s, z24.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z25.s\n"
+      "smin z17.s, p2/M, z17.s, z25.s\n"
+      "smin z18.s, p2/M, z18.s, z25.s\n"
+      "smin z19.s, p2/M, z19.s, z25.s\n"
+      "smin z20.s, p2/M, z20.s, z25.s\n"
+      "smin z21.s, p2/M, z21.s, z25.s\n"
+      "smin z22.s, p2/M, z22.s, z25.s\n"
+      "smin z23.s, p2/M, z23.s, z25.s\n"
+      "smax z16.s, p2/M, z16.s, z24.s\n"
+      "smax z17.s, p2/M, z17.s, z24.s\n"
+      "smax z18.s, p2/M, z18.s, z24.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z24.s\n"
+      "smax z20.s, p2/M, z20.s, z24.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z24.s\n"
+      "smax z22.s, p2/M, z22.s, z24.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
       "st1b { z16.b }, p1, [x27]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
+      "smax z23.s, p2/M, z23.s, z24.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
       "st1b { z20.b }, p1, [x23]\n"
       "addvl x27, x27, #1\n"
       "28:"  // Height 2: Writeback done
@@ -624,13 +624,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -639,8 +639,8 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "34:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "ble 37f\n"
@@ -650,73 +650,73 @@
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "add x24, x24, #0x10\n"
       "ld1rqb { z2.b }, p0/Z, [x22]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "udot z24.s, z4.b, z2.b[0]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z21.s, z5.b, z1.b[0]\n"
-      "udot z25.s, z5.b, z2.b[0]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z22.s, z6.b, z1.b[0]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "udot z26.s, z6.b, z2.b[0]\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28]\n"
+      "udot z16.s, z28.b, z0.b[0]\n"
+      "udot z20.s, z28.b, z1.b[0]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "udot z24.s, z28.b, z2.b[0]\n"
+      "udot z17.s, z30.b, z0.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z21.s, z30.b, z1.b[0]\n"
+      "udot z25.s, z30.b, z2.b[0]\n"
+      "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "udot z18.s, z29.b, z0.b[0]\n"
+      "udot z22.s, z29.b, z1.b[0]\n"
+      "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "udot z26.s, z29.b, z2.b[0]\n"
+      "udot z19.s, z28.b, z0.b[0]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "udot z23.s, z7.b, z1.b[0]\n"
-      "udot z27.s, z7.b, z2.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "udot z20.s, z8.b, z1.b[1]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "udot z23.s, z28.b, z1.b[0]\n"
+      "udot z27.s, z28.b, z2.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "udot z16.s, z3.b, z0.b[1]\n"
+      "udot z20.s, z3.b, z1.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      "udot z24.s, z8.b, z2.b[1]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "udot z24.s, z3.b, z2.b[1]\n"
+      "udot z17.s, z31.b, z0.b[1]\n"
+      "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      "udot z21.s, z9.b, z1.b[1]\n"
-      "udot z25.s, z9.b, z2.b[1]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "udot z22.s, z10.b, z1.b[1]\n"
-      "udot z26.s, z10.b, z2.b[1]\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "udot z23.s, z4.b, z1.b[1]\n"
-      "udot z27.s, z4.b, z2.b[1]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "udot z20.s, z5.b, z1.b[2]\n"
-      "udot z24.s, z5.b, z2.b[2]\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      "udot z21.s, z6.b, z1.b[2]\n"
-      "udot z25.s, z6.b, z2.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z22.s, z7.b, z1.b[2]\n"
-      "udot z26.s, z7.b, z2.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
-      "udot z23.s, z8.b, z1.b[2]\n"
-      "udot z27.s, z8.b, z2.b[2]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z20.s, z9.b, z1.b[3]\n"
-      "udot z24.s, z9.b, z2.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "udot z21.s, z10.b, z1.b[3]\n"
-      "udot z25.s, z10.b, z2.b[3]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z22.s, z4.b, z1.b[3]\n"
-      "udot z26.s, z4.b, z2.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z23.s, z5.b, z1.b[3]\n"
-      "udot z27.s, z5.b, z2.b[3]\n"
+      "udot z21.s, z31.b, z1.b[1]\n"
+      "udot z25.s, z31.b, z2.b[1]\n"
+      "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "udot z18.s, z30.b, z0.b[1]\n"
+      "udot z22.s, z30.b, z1.b[1]\n"
+      "udot z26.s, z30.b, z2.b[1]\n"
+      "udot z19.s, z29.b, z0.b[1]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "udot z23.s, z29.b, z1.b[1]\n"
+      "udot z27.s, z29.b, z2.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "udot z16.s, z28.b, z0.b[2]\n"
+      "udot z20.s, z28.b, z1.b[2]\n"
+      "udot z24.s, z28.b, z2.b[2]\n"
+      "udot z17.s, z5.b, z0.b[2]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "udot z21.s, z5.b, z1.b[2]\n"
+      "udot z25.s, z5.b, z2.b[2]\n"
+      "udot z18.s, z4.b, z0.b[2]\n"
+      "udot z22.s, z4.b, z1.b[2]\n"
+      "udot z26.s, z4.b, z2.b[2]\n"
+      "udot z19.s, z3.b, z0.b[2]\n"
+      "udot z23.s, z3.b, z1.b[2]\n"
+      "udot z27.s, z3.b, z2.b[2]\n"
+      "udot z16.s, z31.b, z0.b[3]\n"
+      "udot z20.s, z31.b, z1.b[3]\n"
+      "udot z24.s, z31.b, z2.b[3]\n"
+      "udot z17.s, z30.b, z0.b[3]\n"
+      "udot z21.s, z30.b, z1.b[3]\n"
+      "udot z25.s, z30.b, z2.b[3]\n"
+      "udot z18.s, z29.b, z0.b[3]\n"
+      "udot z22.s, z29.b, z1.b[3]\n"
+      "udot z26.s, z29.b, z2.b[3]\n"
+      "udot z19.s, z28.b, z0.b[3]\n"
+      "udot z23.s, z28.b, z1.b[3]\n"
+      "udot z27.s, z28.b, z2.b[3]\n"
       "tbnz %x[flags], #31, 36f\n"
       "udot z11.s, z0.b, z15.b\n"
       "udot z12.s, z1.b, z15.b\n"
@@ -731,79 +731,79 @@
       "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "subs x25, x25, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x22]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "udot z24.s, z4.b, z2.b[0]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z21.s, z5.b, z1.b[0]\n"
-      "udot z25.s, z5.b, z2.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28]\n"
+      "udot z16.s, z28.b, z0.b[0]\n"
+      "udot z20.s, z28.b, z1.b[0]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "udot z24.s, z28.b, z2.b[0]\n"
+      "udot z17.s, z30.b, z0.b[0]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z21.s, z30.b, z1.b[0]\n"
+      "udot z25.s, z30.b, z2.b[0]\n"
       "addvl x28, x28, #4\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z22.s, z6.b, z1.b[0]\n"
-      "udot z26.s, z6.b, z2.b[0]\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "udot z23.s, z7.b, z1.b[0]\n"
-      "udot z27.s, z7.b, z2.b[0]\n"
+      "udot z18.s, z29.b, z0.b[0]\n"
+      "udot z22.s, z29.b, z1.b[0]\n"
+      "udot z26.s, z29.b, z2.b[0]\n"
+      "udot z19.s, z28.b, z0.b[0]\n"
+      "udot z23.s, z28.b, z1.b[0]\n"
+      "udot z27.s, z28.b, z2.b[0]\n"
       "ble 38f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z31.b }, p2/Z, [x28]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z20.s, z8.b, z1.b[1]\n"
-      "udot z24.s, z8.b, z2.b[1]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "udot z21.s, z9.b, z1.b[1]\n"
+      "udot z16.s, z31.b, z0.b[1]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z20.s, z31.b, z1.b[1]\n"
+      "udot z24.s, z31.b, z2.b[1]\n"
+      "udot z17.s, z30.b, z0.b[1]\n"
+      "udot z21.s, z30.b, z1.b[1]\n"
       "addvl x28, x28, #4\n"
-      "udot z25.s, z9.b, z2.b[1]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "udot z22.s, z10.b, z1.b[1]\n"
-      "udot z26.s, z10.b, z2.b[1]\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
-      "udot z23.s, z4.b, z1.b[1]\n"
-      "udot z27.s, z4.b, z2.b[1]\n"
+      "udot z25.s, z30.b, z2.b[1]\n"
+      "udot z18.s, z29.b, z0.b[1]\n"
+      "udot z22.s, z29.b, z1.b[1]\n"
+      "udot z26.s, z29.b, z2.b[1]\n"
+      "udot z19.s, z28.b, z0.b[1]\n"
+      "udot z23.s, z28.b, z1.b[1]\n"
+      "udot z27.s, z28.b, z2.b[1]\n"
       "ble 38f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z31.b }, p2/Z, [x28]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z20.s, z5.b, z1.b[2]\n"
-      "udot z24.s, z5.b, z2.b[2]\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "udot z21.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z31.b, z0.b[2]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z20.s, z31.b, z1.b[2]\n"
+      "udot z24.s, z31.b, z2.b[2]\n"
+      "udot z17.s, z30.b, z0.b[2]\n"
+      "udot z21.s, z30.b, z1.b[2]\n"
       "addvl x28, x28, #4\n"
-      "udot z25.s, z6.b, z2.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z22.s, z7.b, z1.b[2]\n"
-      "udot z26.s, z7.b, z2.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
-      "udot z23.s, z8.b, z1.b[2]\n"
-      "udot z27.s, z8.b, z2.b[2]\n"
+      "udot z25.s, z30.b, z2.b[2]\n"
+      "udot z18.s, z29.b, z0.b[2]\n"
+      "udot z22.s, z29.b, z1.b[2]\n"
+      "udot z26.s, z29.b, z2.b[2]\n"
+      "udot z19.s, z28.b, z0.b[2]\n"
+      "udot z23.s, z28.b, z1.b[2]\n"
+      "udot z27.s, z28.b, z2.b[2]\n"
       "ble 38f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z20.s, z9.b, z1.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z24.s, z9.b, z2.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "udot z21.s, z10.b, z1.b[3]\n"
-      "udot z25.s, z10.b, z2.b[3]\n"
+      "ld1b { z31.b }, p2/Z, [x28]\n"
+      "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z31.b, z0.b[3]\n"
+      "udot z20.s, z31.b, z1.b[3]\n"
+      "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z24.s, z31.b, z2.b[3]\n"
+      "udot z17.s, z30.b, z0.b[3]\n"
+      "udot z21.s, z30.b, z1.b[3]\n"
+      "udot z25.s, z30.b, z2.b[3]\n"
       "addvl x28, x28, #4\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z22.s, z4.b, z1.b[3]\n"
-      "udot z26.s, z4.b, z2.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z23.s, z5.b, z1.b[3]\n"
-      "udot z27.s, z5.b, z2.b[3]\n"
+      "udot z18.s, z29.b, z0.b[3]\n"
+      "udot z22.s, z29.b, z1.b[3]\n"
+      "udot z26.s, z29.b, z2.b[3]\n"
+      "udot z19.s, z28.b, z0.b[3]\n"
+      "udot z23.s, z28.b, z1.b[3]\n"
+      "udot z27.s, z28.b, z2.b[3]\n"
       "38:"  // Height 3: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 39f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -821,33 +821,33 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z3.s }, p2/Z, [x20]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       "uaddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
       "uaddv d12, p0, z12.s\n"
       "uaddv d13, p0, z13.s\n"
       "mov z12.s, z12.s[0]\n"
       "mov z13.s, z13.s[0]\n"
-      "neg z3.s, p2/M, z3.s\n"
-      "mul z11.s, p2/M, z11.s, z3.s\n"
-      "mul z12.s, p2/M, z12.s, z3.s\n"
-      "mul z13.s, p2/M, z13.s, z3.s\n"
+      "neg z28.s, p2/M, z28.s\n"
+      "mul z11.s, p2/M, z11.s, z28.s\n"
+      "mul z12.s, p2/M, z12.s, z28.s\n"
+      "mul z13.s, p2/M, z13.s, z28.s\n"
       "40:"  // Height 3: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
       "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z20.s, z20.s, z12.s\n"
       "add z21.s, z21.s, z12.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z22.s, z22.s, z12.s\n"
       "add z23.s, z23.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z24.s, z24.s, z13.s\n"
       "add z25.s, z25.s, z13.s\n"
@@ -855,133 +855,133 @@
       "add z26.s, z26.s, z13.s\n"
       "add z27.s, z27.s, z13.s\n"
       "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "add z17.s, z17.s, z31.s\n"
+      "add z18.s, z18.s, z30.s\n"
+      "add z19.s, z19.s, z29.s\n"
       "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
+      "add z21.s, z21.s, z31.s\n"
+      "add z22.s, z22.s, z30.s\n"
+      "add z23.s, z23.s, z29.s\n"
       "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
+      "add z25.s, z25.s, z31.s\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      "add z26.s, z26.s, z30.s\n"
+      "add z27.s, z27.s, z29.s\n"
+      ".inst 0x04bc7610  // sqrdmulh z16.s, z16.s, z28.s\n"
+      ".inst 0x04bc7631  // sqrdmulh z17.s, z17.s, z28.s\n"
+      ".inst 0x04bc7652  // sqrdmulh z18.s, z18.s, z28.s\n"
+      ".inst 0x04bc7673  // sqrdmulh z19.s, z19.s, z28.s\n"
+      ".inst 0x04bc7694  // sqrdmulh z20.s, z20.s, z28.s\n"
+      ".inst 0x04bc76b5  // sqrdmulh z21.s, z21.s, z28.s\n"
+      ".inst 0x04bc76d6  // sqrdmulh z22.s, z22.s, z28.s\n"
+      ".inst 0x04bc76f7  // sqrdmulh z23.s, z23.s, z28.s\n"
+      ".inst 0x04bc7718  // sqrdmulh z24.s, z24.s, z28.s\n"
+      ".inst 0x04bc7739  // sqrdmulh z25.s, z25.s, z28.s\n"
+      ".inst 0x04bc775a  // sqrdmulh z26.s, z26.s, z28.s\n"
+      ".inst 0x04bc777b  // sqrdmulh z27.s, z27.s, z28.s\n"
       "tbz %x[flags], #5, 41f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "and z8.d, z20.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "sqadd z20.s, z20.s, z8.s\n"
-      "and z9.d, z21.d, z0.d\n"
-      "and z10.d, z22.d, z0.d\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z24.d, z0.d\n"
-      "and z6.d, z25.d, z0.d\n"
-      "and z7.d, z26.d, z0.d\n"
-      "and z8.d, z27.d, z0.d\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z21.s, z21.s, z9.s\n"
-      "sqadd z22.s, z22.s, z10.s\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z24.s, z24.s, z5.s\n"
-      "sqadd z25.s, z25.s, z6.s\n"
-      "sqadd z26.s, z26.s, z7.s\n"
-      "sqadd z27.s, z27.s, z8.s\n"
+      "and z1.d, z16.d, z0.d\n"
+      "and z31.d, z17.d, z0.d\n"
+      "and z30.d, z18.d, z0.d\n"
+      "and z29.d, z19.d, z0.d\n"
+      "and z28.d, z20.d, z0.d\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z1.s\n"
+      "sqadd z17.s, z17.s, z31.s\n"
+      "sqadd z18.s, z18.s, z30.s\n"
+      "sqadd z19.s, z19.s, z29.s\n"
+      "sqadd z20.s, z20.s, z28.s\n"
+      "and z3.d, z21.d, z0.d\n"
+      "and z2.d, z22.d, z0.d\n"
+      "and z1.d, z23.d, z0.d\n"
+      "and z31.d, z24.d, z0.d\n"
+      "and z30.d, z25.d, z0.d\n"
+      "and z29.d, z26.d, z0.d\n"
+      "and z28.d, z27.d, z0.d\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z31.s, z31.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z3.s\n"
+      "sqadd z22.s, z22.s, z2.s\n"
+      "sqadd z23.s, z23.s, z1.s\n"
+      "sqadd z24.s, z24.s, z31.s\n"
+      "sqadd z25.s, z25.s, z30.s\n"
+      "sqadd z26.s, z26.s, z29.s\n"
+      "sqadd z27.s, z27.s, z28.s\n"
       "41:"  // Height 3: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z28.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z28.s\n"
+      "add z18.s, z18.s, z28.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z28.s\n"
+      "add z20.s, z20.s, z28.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z28.s\n"
+      "add z22.s, z22.s, z28.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z23.s, z23.s, z28.s\n"
+      "add z24.s, z24.s, z28.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z28.s\n"
+      "add z26.s, z26.s, z28.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z27.s, z27.s, z4.s\n"
+      "ld1rw { z29.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z28.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z29.s\n"
+      "smin z17.s, p2/M, z17.s, z29.s\n"
+      "smin z18.s, p2/M, z18.s, z29.s\n"
+      "smin z19.s, p2/M, z19.s, z29.s\n"
+      "smin z20.s, p2/M, z20.s, z29.s\n"
+      "smin z21.s, p2/M, z21.s, z29.s\n"
+      "smin z22.s, p2/M, z22.s, z29.s\n"
+      "smin z23.s, p2/M, z23.s, z29.s\n"
+      "smin z24.s, p2/M, z24.s, z29.s\n"
+      "smin z25.s, p2/M, z25.s, z29.s\n"
+      "smin z26.s, p2/M, z26.s, z29.s\n"
+      "smin z27.s, p2/M, z27.s, z29.s\n"
+      "smax z16.s, p2/M, z16.s, z28.s\n"
+      "smax z17.s, p2/M, z17.s, z28.s\n"
+      "smax z18.s, p2/M, z18.s, z28.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z28.s\n"
+      "smax z20.s, p2/M, z20.s, z28.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z28.s\n"
+      "smax z22.s, p2/M, z22.s, z28.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
       "st1b { z16.b }, p1, [x27]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z28.s\n"
+      "smax z24.s, p2/M, z24.s, z28.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z28.s\n"
+      "smax z26.s, p2/M, z26.s, z28.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
       "st1b { z20.b }, p1, [x23]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
+      "smax z27.s, p2/M, z27.s, z28.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
       "st1b { z24.b }, p1, [x22]\n"
       "addvl x27, x27, #1\n"
       "42:"  // Height 3: Writeback done
@@ -1027,14 +1027,14 @@
       "46:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 47f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 48f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1044,9 +1044,9 @@
       "b 48f\n"
       "47:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "48:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "ble 51f\n"
@@ -1059,88 +1059,88 @@
       "ld1rqb { z3.b }, p0/Z, [x21]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z24.s, z4.b, z2.b[0]\n"
-      "udot z28.s, z4.b, z3.b[0]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "udot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x28]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z5.b, z0.b[0]\n"
+      "udot z20.s, z5.b, z1.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z24.s, z5.b, z2.b[0]\n"
+      "udot z28.s, z5.b, z3.b[0]\n"
+      "udot z17.s, z4.b, z0.b[0]\n"
+      "udot z21.s, z4.b, z1.b[0]\n"
       "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "udot z25.s, z5.b, z2.b[0]\n"
-      "udot z29.s, z5.b, z3.b[0]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "udot z25.s, z4.b, z2.b[0]\n"
+      "udot z29.s, z4.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "udot z18.s, z10.b, z0.b[0]\n"
+      "udot z22.s, z10.b, z1.b[0]\n"
       "addvl x28, x28, #16\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      "udot z26.s, z6.b, z2.b[0]\n"
-      "udot z30.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      "udot z26.s, z10.b, z2.b[0]\n"
+      "udot z30.s, z10.b, z3.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "udot z23.s, z7.b, z1.b[0]\n"
-      "udot z27.s, z7.b, z2.b[0]\n"
-      "udot z31.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "udot z19.s, z9.b, z0.b[0]\n"
+      "udot z23.s, z9.b, z1.b[0]\n"
+      "udot z27.s, z9.b, z2.b[0]\n"
+      "udot z31.s, z9.b, z3.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
       "udot z16.s, z8.b, z0.b[1]\n"
       "udot z20.s, z8.b, z1.b[1]\n"
       "udot z24.s, z8.b, z2.b[1]\n"
       "udot z28.s, z8.b, z3.b[1]\n"
       "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
-      "udot z21.s, z9.b, z1.b[1]\n"
-      "udot z25.s, z9.b, z2.b[1]\n"
-      "udot z29.s, z9.b, z3.b[1]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "udot z22.s, z10.b, z1.b[1]\n"
-      "udot z26.s, z10.b, z2.b[1]\n"
-      "udot z30.s, z10.b, z3.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      "udot z19.s, z4.b, z0.b[1]\n"
-      "udot z23.s, z4.b, z1.b[1]\n"
-      "udot z27.s, z4.b, z2.b[1]\n"
-      "udot z31.s, z4.b, z3.b[1]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "udot z20.s, z5.b, z1.b[2]\n"
-      "udot z24.s, z5.b, z2.b[2]\n"
-      "udot z28.s, z5.b, z3.b[2]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      "udot z17.s, z6.b, z0.b[2]\n"
-      "udot z21.s, z6.b, z1.b[2]\n"
-      "udot z25.s, z6.b, z2.b[2]\n"
-      "udot z29.s, z6.b, z3.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z22.s, z7.b, z1.b[2]\n"
-      "udot z26.s, z7.b, z2.b[2]\n"
-      "udot z30.s, z7.b, z3.b[2]\n"
+      "udot z17.s, z7.b, z0.b[1]\n"
+      "udot z21.s, z7.b, z1.b[1]\n"
+      "udot z25.s, z7.b, z2.b[1]\n"
+      "udot z29.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "udot z18.s, z6.b, z0.b[1]\n"
+      "udot z22.s, z6.b, z1.b[1]\n"
+      "udot z26.s, z6.b, z2.b[1]\n"
+      "udot z30.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "udot z19.s, z5.b, z0.b[1]\n"
+      "udot z23.s, z5.b, z1.b[1]\n"
+      "udot z27.s, z5.b, z2.b[1]\n"
+      "udot z31.s, z5.b, z3.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "udot z16.s, z4.b, z0.b[2]\n"
+      "udot z20.s, z4.b, z1.b[2]\n"
+      "udot z24.s, z4.b, z2.b[2]\n"
+      "udot z28.s, z4.b, z3.b[2]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      "udot z17.s, z10.b, z0.b[2]\n"
+      "udot z21.s, z10.b, z1.b[2]\n"
+      "udot z25.s, z10.b, z2.b[2]\n"
+      "udot z29.s, z10.b, z3.b[2]\n"
+      "udot z18.s, z9.b, z0.b[2]\n"
+      "udot z22.s, z9.b, z1.b[2]\n"
+      "udot z26.s, z9.b, z2.b[2]\n"
+      "udot z30.s, z9.b, z3.b[2]\n"
       "udot z19.s, z8.b, z0.b[2]\n"
       "udot z23.s, z8.b, z1.b[2]\n"
       "udot z27.s, z8.b, z2.b[2]\n"
       "udot z31.s, z8.b, z3.b[2]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z20.s, z9.b, z1.b[3]\n"
-      "udot z24.s, z9.b, z2.b[3]\n"
-      "udot z28.s, z9.b, z3.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "udot z21.s, z10.b, z1.b[3]\n"
-      "udot z25.s, z10.b, z2.b[3]\n"
-      "udot z29.s, z10.b, z3.b[3]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z22.s, z4.b, z1.b[3]\n"
-      "udot z26.s, z4.b, z2.b[3]\n"
-      "udot z30.s, z4.b, z3.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z23.s, z5.b, z1.b[3]\n"
-      "udot z27.s, z5.b, z2.b[3]\n"
-      "udot z31.s, z5.b, z3.b[3]\n"
+      "udot z16.s, z7.b, z0.b[3]\n"
+      "udot z20.s, z7.b, z1.b[3]\n"
+      "udot z24.s, z7.b, z2.b[3]\n"
+      "udot z28.s, z7.b, z3.b[3]\n"
+      "udot z17.s, z6.b, z0.b[3]\n"
+      "udot z21.s, z6.b, z1.b[3]\n"
+      "udot z25.s, z6.b, z2.b[3]\n"
+      "udot z29.s, z6.b, z3.b[3]\n"
+      "udot z18.s, z5.b, z0.b[3]\n"
+      "udot z22.s, z5.b, z1.b[3]\n"
+      "udot z26.s, z5.b, z2.b[3]\n"
+      "udot z30.s, z5.b, z3.b[3]\n"
+      "udot z19.s, z4.b, z0.b[3]\n"
+      "udot z23.s, z4.b, z1.b[3]\n"
+      "udot z27.s, z4.b, z2.b[3]\n"
+      "udot z31.s, z4.b, z3.b[3]\n"
       "tbnz %x[flags], #31, 50f\n"
       "udot z11.s, z0.b, z15.b\n"
       "udot z12.s, z1.b, z15.b\n"
@@ -1157,95 +1157,95 @@
       "subs x25, x25, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x22]\n"
       "ld1rqb { z3.b }, p0/Z, [x21]\n"
-      "ld1b { z4.b }, p2/Z, [x28]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z4.b, z0.b[0]\n"
-      "udot z20.s, z4.b, z1.b[0]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z24.s, z4.b, z2.b[0]\n"
-      "udot z28.s, z4.b, z3.b[0]\n"
-      "udot z17.s, z5.b, z0.b[0]\n"
-      "udot z21.s, z5.b, z1.b[0]\n"
-      "addvl x28, x28, #4\n"
-      "udot z25.s, z5.b, z2.b[0]\n"
-      "udot z29.s, z5.b, z3.b[0]\n"
-      "udot z18.s, z6.b, z0.b[0]\n"
-      "udot z22.s, z6.b, z1.b[0]\n"
-      "udot z26.s, z6.b, z2.b[0]\n"
-      "udot z30.s, z6.b, z3.b[0]\n"
-      "udot z19.s, z7.b, z0.b[0]\n"
-      "udot z23.s, z7.b, z1.b[0]\n"
-      "udot z27.s, z7.b, z2.b[0]\n"
-      "udot z31.s, z7.b, z3.b[0]\n"
-      "ble 52f\n"
-      "ld1b { z8.b }, p2/Z, [x28]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "subs x25, x25, #0x4\n"
-      "udot z16.s, z8.b, z0.b[1]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z7.b, z0.b[0]\n"
+      "udot z20.s, z7.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
       "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z20.s, z8.b, z1.b[1]\n"
-      "udot z24.s, z8.b, z2.b[1]\n"
-      "udot z28.s, z8.b, z3.b[1]\n"
-      "udot z17.s, z9.b, z0.b[1]\n"
+      "udot z24.s, z7.b, z2.b[0]\n"
+      "udot z28.s, z7.b, z3.b[0]\n"
+      "udot z17.s, z6.b, z0.b[0]\n"
+      "udot z21.s, z6.b, z1.b[0]\n"
       "addvl x28, x28, #4\n"
-      "udot z21.s, z9.b, z1.b[1]\n"
-      "udot z25.s, z9.b, z2.b[1]\n"
-      "udot z29.s, z9.b, z3.b[1]\n"
-      "udot z18.s, z10.b, z0.b[1]\n"
-      "udot z22.s, z10.b, z1.b[1]\n"
-      "udot z26.s, z10.b, z2.b[1]\n"
-      "udot z30.s, z10.b, z3.b[1]\n"
+      "udot z25.s, z6.b, z2.b[0]\n"
+      "udot z29.s, z6.b, z3.b[0]\n"
+      "udot z18.s, z5.b, z0.b[0]\n"
+      "udot z22.s, z5.b, z1.b[0]\n"
+      "udot z26.s, z5.b, z2.b[0]\n"
+      "udot z30.s, z5.b, z3.b[0]\n"
+      "udot z19.s, z4.b, z0.b[0]\n"
+      "udot z23.s, z4.b, z1.b[0]\n"
+      "udot z27.s, z4.b, z2.b[0]\n"
+      "udot z31.s, z4.b, z3.b[0]\n"
+      "ble 52f\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "subs x25, x25, #0x4\n"
+      "udot z16.s, z7.b, z0.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z20.s, z7.b, z1.b[1]\n"
+      "udot z24.s, z7.b, z2.b[1]\n"
+      "udot z28.s, z7.b, z3.b[1]\n"
+      "udot z17.s, z6.b, z0.b[1]\n"
+      "addvl x28, x28, #4\n"
+      "udot z21.s, z6.b, z1.b[1]\n"
+      "udot z25.s, z6.b, z2.b[1]\n"
+      "udot z29.s, z6.b, z3.b[1]\n"
+      "udot z18.s, z5.b, z0.b[1]\n"
+      "udot z22.s, z5.b, z1.b[1]\n"
+      "udot z26.s, z5.b, z2.b[1]\n"
+      "udot z30.s, z5.b, z3.b[1]\n"
       "udot z19.s, z4.b, z0.b[1]\n"
       "udot z23.s, z4.b, z1.b[1]\n"
       "udot z27.s, z4.b, z2.b[1]\n"
       "udot z31.s, z4.b, z3.b[1]\n"
       "ble 52f\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
       "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x4\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z20.s, z5.b, z1.b[2]\n"
-      "udot z24.s, z5.b, z2.b[2]\n"
-      "udot z28.s, z5.b, z3.b[2]\n"
+      "udot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z20.s, z7.b, z1.b[2]\n"
+      "udot z24.s, z7.b, z2.b[2]\n"
+      "udot z28.s, z7.b, z3.b[2]\n"
       "udot z17.s, z6.b, z0.b[2]\n"
       "addvl x28, x28, #4\n"
       "udot z21.s, z6.b, z1.b[2]\n"
       "udot z25.s, z6.b, z2.b[2]\n"
       "udot z29.s, z6.b, z3.b[2]\n"
-      "udot z18.s, z7.b, z0.b[2]\n"
-      "udot z22.s, z7.b, z1.b[2]\n"
-      "udot z26.s, z7.b, z2.b[2]\n"
-      "udot z30.s, z7.b, z3.b[2]\n"
-      "udot z19.s, z8.b, z0.b[2]\n"
-      "udot z23.s, z8.b, z1.b[2]\n"
-      "udot z27.s, z8.b, z2.b[2]\n"
-      "udot z31.s, z8.b, z3.b[2]\n"
+      "udot z18.s, z5.b, z0.b[2]\n"
+      "udot z22.s, z5.b, z1.b[2]\n"
+      "udot z26.s, z5.b, z2.b[2]\n"
+      "udot z30.s, z5.b, z3.b[2]\n"
+      "udot z19.s, z4.b, z0.b[2]\n"
+      "udot z23.s, z4.b, z1.b[2]\n"
+      "udot z27.s, z4.b, z2.b[2]\n"
+      "udot z31.s, z4.b, z3.b[2]\n"
       "ble 52f\n"
-      "ld1b { z9.b }, p2/Z, [x28]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "udot z16.s, z9.b, z0.b[3]\n"
-      "udot z20.s, z9.b, z1.b[3]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "udot z24.s, z9.b, z2.b[3]\n"
-      "udot z28.s, z9.b, z3.b[3]\n"
-      "udot z17.s, z10.b, z0.b[3]\n"
-      "udot z21.s, z10.b, z1.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x28]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "udot z16.s, z7.b, z0.b[3]\n"
+      "udot z20.s, z7.b, z1.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "udot z24.s, z7.b, z2.b[3]\n"
+      "udot z28.s, z7.b, z3.b[3]\n"
+      "udot z17.s, z6.b, z0.b[3]\n"
+      "udot z21.s, z6.b, z1.b[3]\n"
       "addvl x28, x28, #4\n"
-      "udot z25.s, z10.b, z2.b[3]\n"
-      "udot z29.s, z10.b, z3.b[3]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z22.s, z4.b, z1.b[3]\n"
-      "udot z26.s, z4.b, z2.b[3]\n"
-      "udot z30.s, z4.b, z3.b[3]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z23.s, z5.b, z1.b[3]\n"
-      "udot z27.s, z5.b, z2.b[3]\n"
-      "udot z31.s, z5.b, z3.b[3]\n"
+      "udot z25.s, z6.b, z2.b[3]\n"
+      "udot z29.s, z6.b, z3.b[3]\n"
+      "udot z18.s, z5.b, z0.b[3]\n"
+      "udot z22.s, z5.b, z1.b[3]\n"
+      "udot z26.s, z5.b, z2.b[3]\n"
+      "udot z30.s, z5.b, z3.b[3]\n"
+      "udot z19.s, z4.b, z0.b[3]\n"
+      "udot z23.s, z4.b, z1.b[3]\n"
+      "udot z27.s, z4.b, z2.b[3]\n"
+      "udot z31.s, z4.b, z3.b[3]\n"
       "52:"  // Height 4: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 53f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -1265,7 +1265,7 @@
       "mov x20, #0x4\n"
       "whilelt p0.s, XZR, x20\n"
       "add x20, %x[qp], %[b_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
       "uaddv d11, p0, z11.s\n"
       "mov z11.s, z11.s[0]\n"
       "uaddv d12, p0, z12.s\n"
@@ -1273,28 +1273,28 @@
       "mov z12.s, z12.s[0]\n"
       "mov z13.s, z13.s[0]\n"
       "uaddv d14, p0, z14.s\n"
-      "neg z4.s, p2/M, z4.s\n"
+      "neg z0.s, p2/M, z0.s\n"
       "mov z14.s, z14.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z4.s\n"
-      "mul z12.s, p2/M, z12.s, z4.s\n"
-      "mul z13.s, p2/M, z13.s, z4.s\n"
-      "mul z14.s, p2/M, z14.s, z4.s\n"
+      "mul z11.s, p2/M, z11.s, z0.s\n"
+      "mul z12.s, p2/M, z12.s, z0.s\n"
+      "mul z13.s, p2/M, z13.s, z0.s\n"
+      "mul z14.s, p2/M, z14.s, z0.s\n"
       "54:"  // Height 4: skip row sum fixup
       "add z16.s, z16.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z4.s }, p2/Z, [x10]\n"
+      "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z20.s, z20.s, z12.s\n"
       "add z21.s, z21.s, z12.s\n"
       "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z22.s, z22.s, z12.s\n"
       "add z23.s, z23.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
       "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z24.s, z24.s, z13.s\n"
       "add z25.s, z25.s, z13.s\n"
@@ -1305,174 +1305,174 @@
       "add z29.s, z29.s, z14.s\n"
       "add z30.s, z30.s, z14.s\n"
       "add z31.s, z31.s, z14.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z20.s, z20.s, z0.s\n"
-      "add z21.s, z21.s, z1.s\n"
-      "add z22.s, z22.s, z2.s\n"
-      "add z23.s, z23.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      "add z28.s, z28.s, z0.s\n"
-      "add z29.s, z29.s, z1.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z0.s\n"
+      "add z18.s, z18.s, z3.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "add z21.s, z21.s, z0.s\n"
+      "add z22.s, z22.s, z3.s\n"
+      "add z23.s, z23.s, z2.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z0.s\n"
+      "add z26.s, z26.s, z3.s\n"
+      "add z27.s, z27.s, z2.s\n"
+      "add z28.s, z28.s, z4.s\n"
+      "add z29.s, z29.s, z0.s\n"
       "ld1rw { z0.s }, p2/Z, [x20]\n"
-      "add z30.s, z30.s, z2.s\n"
-      "add z31.s, z31.s, z3.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
-      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
-      ".inst 0x04a477bd  // sqrdmulh z29.s, z29.s, z4.s\n"
-      ".inst 0x04a477de  // sqrdmulh z30.s, z30.s, z4.s\n"
-      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
+      "add z30.s, z30.s, z3.s\n"
+      "add z31.s, z31.s, z2.s\n"
+      ".inst 0x04a17610  // sqrdmulh z16.s, z16.s, z1.s\n"
+      ".inst 0x04a17631  // sqrdmulh z17.s, z17.s, z1.s\n"
+      ".inst 0x04a17652  // sqrdmulh z18.s, z18.s, z1.s\n"
+      ".inst 0x04a17673  // sqrdmulh z19.s, z19.s, z1.s\n"
+      ".inst 0x04a17694  // sqrdmulh z20.s, z20.s, z1.s\n"
+      ".inst 0x04a176b5  // sqrdmulh z21.s, z21.s, z1.s\n"
+      ".inst 0x04a176d6  // sqrdmulh z22.s, z22.s, z1.s\n"
+      ".inst 0x04a176f7  // sqrdmulh z23.s, z23.s, z1.s\n"
+      ".inst 0x04a17718  // sqrdmulh z24.s, z24.s, z1.s\n"
+      ".inst 0x04a17739  // sqrdmulh z25.s, z25.s, z1.s\n"
+      ".inst 0x04a1775a  // sqrdmulh z26.s, z26.s, z1.s\n"
+      ".inst 0x04a1777b  // sqrdmulh z27.s, z27.s, z1.s\n"
+      ".inst 0x04a1779c  // sqrdmulh z28.s, z28.s, z1.s\n"
+      ".inst 0x04a177bd  // sqrdmulh z29.s, z29.s, z1.s\n"
+      ".inst 0x04a177de  // sqrdmulh z30.s, z30.s, z1.s\n"
+      ".inst 0x04a177ff  // sqrdmulh z31.s, z31.s, z1.s\n"
       "tbz %x[flags], #5, 55f\n"
-      "and z4.d, z16.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z16.s, z16.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "and z8.d, z20.d, z0.d\n"
-      "and z9.d, z21.d, z0.d\n"
-      "and z10.d, z22.d, z0.d\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z24.d, z0.d\n"
-      "asr z6.s, z6.s, #0x1f\n"
+      "and z2.d, z16.d, z0.d\n"
+      "and z1.d, z17.d, z0.d\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z2.s\n"
+      "sqadd z17.s, z17.s, z1.s\n"
+      "and z7.d, z18.d, z0.d\n"
+      "and z6.d, z19.d, z0.d\n"
+      "and z5.d, z20.d, z0.d\n"
+      "and z4.d, z21.d, z0.d\n"
+      "and z3.d, z22.d, z0.d\n"
+      "and z2.d, z23.d, z0.d\n"
+      "and z1.d, z24.d, z0.d\n"
       "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
-      "sqadd z20.s, z20.s, z8.s\n"
-      "sqadd z21.s, z21.s, z9.s\n"
-      "sqadd z22.s, z22.s, z10.s\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z24.s, z24.s, z5.s\n"
-      "and z6.d, z25.d, z0.d\n"
-      "and z7.d, z26.d, z0.d\n"
-      "and z8.d, z27.d, z0.d\n"
-      "and z9.d, z28.d, z0.d\n"
-      "and z10.d, z29.d, z0.d\n"
-      "and z4.d, z30.d, z0.d\n"
-      "and z5.d, z31.d, z0.d\n"
       "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
       "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z25.s, z25.s, z6.s\n"
-      "sqadd z26.s, z26.s, z7.s\n"
-      "sqadd z27.s, z27.s, z8.s\n"
-      "sqadd z28.s, z28.s, z9.s\n"
-      "sqadd z29.s, z29.s, z10.s\n"
-      "sqadd z30.s, z30.s, z4.s\n"
-      "sqadd z31.s, z31.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z7.s\n"
+      "sqadd z19.s, z19.s, z6.s\n"
+      "sqadd z20.s, z20.s, z5.s\n"
+      "sqadd z21.s, z21.s, z4.s\n"
+      "sqadd z22.s, z22.s, z3.s\n"
+      "sqadd z23.s, z23.s, z2.s\n"
+      "sqadd z24.s, z24.s, z1.s\n"
+      "and z7.d, z25.d, z0.d\n"
+      "and z6.d, z26.d, z0.d\n"
+      "and z5.d, z27.d, z0.d\n"
+      "and z4.d, z28.d, z0.d\n"
+      "and z3.d, z29.d, z0.d\n"
+      "and z2.d, z30.d, z0.d\n"
+      "and z1.d, z31.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z25.s, z25.s, z7.s\n"
+      "sqadd z26.s, z26.s, z6.s\n"
+      "sqadd z27.s, z27.s, z5.s\n"
+      "sqadd z28.s, z28.s, z4.s\n"
+      "sqadd z29.s, z29.s, z3.s\n"
+      "sqadd z30.s, z30.s, z2.s\n"
+      "sqadd z31.s, z31.s, z1.s\n"
       "55:"  // Height 4: no shift correction
       "add x20, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x20]\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z16.s, z16.s, z2.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z2.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z20.s, z20.s, z4.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z20.s, z20.s, z2.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
-      "add z21.s, z21.s, z4.s\n"
-      "add z22.s, z22.s, z4.s\n"
+      "add z21.s, z21.s, z2.s\n"
+      "add z22.s, z22.s, z2.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z23.s, z23.s, z2.s\n"
+      "add z24.s, z24.s, z2.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z2.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
       ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add z28.s, z28.s, z4.s\n"
+      "add z27.s, z27.s, z2.s\n"
+      "add z28.s, z28.s, z2.s\n"
       ".inst 0x4482881d  // srshl z29.s, p2/M, z29.s, z0.s\n"
       ".inst 0x4482881e  // srshl z30.s, p2/M, z30.s, z0.s\n"
-      "add z29.s, z29.s, z4.s\n"
-      "add z30.s, z30.s, z4.s\n"
+      "add z29.s, z29.s, z2.s\n"
+      "add z30.s, z30.s, z2.s\n"
       ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
       "add x20, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x20]\n"
-      "add z31.s, z31.s, z4.s\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "add z31.s, z31.s, z2.s\n"
       "add x20, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x20]\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smin z28.s, p2/M, z28.s, z6.s\n"
-      "smin z29.s, p2/M, z29.s, z6.s\n"
-      "smin z30.s, p2/M, z30.s, z6.s\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "smin z16.s, p2/M, z16.s, z1.s\n"
+      "smin z17.s, p2/M, z17.s, z1.s\n"
+      "smin z18.s, p2/M, z18.s, z1.s\n"
+      "smin z19.s, p2/M, z19.s, z1.s\n"
+      "smin z20.s, p2/M, z20.s, z1.s\n"
+      "smin z21.s, p2/M, z21.s, z1.s\n"
+      "smin z22.s, p2/M, z22.s, z1.s\n"
+      "smin z23.s, p2/M, z23.s, z1.s\n"
+      "smin z24.s, p2/M, z24.s, z1.s\n"
+      "smin z25.s, p2/M, z25.s, z1.s\n"
+      "smin z26.s, p2/M, z26.s, z1.s\n"
+      "smin z27.s, p2/M, z27.s, z1.s\n"
+      "smin z28.s, p2/M, z28.s, z1.s\n"
+      "smin z29.s, p2/M, z29.s, z1.s\n"
+      "smin z30.s, p2/M, z30.s, z1.s\n"
+      "smin z31.s, p2/M, z31.s, z1.s\n"
+      "smax z16.s, p2/M, z16.s, z0.s\n"
+      "smax z17.s, p2/M, z17.s, z0.s\n"
+      "smax z18.s, p2/M, z18.s, z0.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z0.s\n"
+      "smax z20.s, p2/M, z20.s, z0.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z21.s, p2/M, z21.s, z0.s\n"
+      "smax z22.s, p2/M, z22.s, z0.s\n"
       "uzp1 z20.h, z20.h, z21.h\n"
       "st1b { z16.b }, p1, [x27]\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z21.h, z22.h, z23.h\n"
-      "uzp1 z20.b, z20.b, z21.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z0.s\n"
+      "smax z24.s, p2/M, z24.s, z0.s\n"
+      "uzp1 z16.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z0.s\n"
+      "smax z26.s, p2/M, z26.s, z0.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
       "st1b { z20.b }, p1, [x23]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "smax z28.s, p2/M, z28.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "smax z29.s, p2/M, z29.s, z5.s\n"
-      "smax z30.s, p2/M, z30.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z0.s\n"
+      "smax z28.s, p2/M, z28.s, z0.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "smax z29.s, p2/M, z29.s, z0.s\n"
+      "smax z30.s, p2/M, z30.s, z0.s\n"
       "uzp1 z28.h, z28.h, z29.h\n"
       "st1b { z24.b }, p1, [x22]\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "uzp1 z29.h, z30.h, z31.h\n"
-      "uzp1 z28.b, z28.b, z29.b\n"
+      "smax z31.s, p2/M, z31.s, z0.s\n"
+      "uzp1 z16.h, z30.h, z31.h\n"
+      "uzp1 z28.b, z28.b, z16.b\n"
       "st1b { z28.b }, p1, [x21]\n"
       "addvl x27, x27, #1\n"
       "56:"  // Height 4: Writeback done
@@ -1491,7 +1491,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "58:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1499,4 +1498,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
index da27554..5de68cc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, uint8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
index f9d38c2..69894be 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
@@ -108,11 +108,11 @@
       "4:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 5f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
       "cbnz x26, 6f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -125,41 +125,41 @@
       "7:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "trn1 z0.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45d89810  // ummla z16.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "trn2 z1.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45d99814  // ummla z20.s, z0.b, z25.b\n"
+      ".inst 0x45d89811  // ummla z17.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45da9815  // ummla z21.s, z0.b, z26.b\n"
+      ".inst 0x45d99812  // ummla z18.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
+      ".inst 0x45d89816  // ummla z22.s, z0.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x45da9813  // ummla z19.s, z0.b, z26.b\n"
+      ".inst 0x45d99817  // ummla z23.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x45d89830  // ummla z16.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x45da9834  // ummla z20.s, z1.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      ".inst 0x45d99831  // ummla z17.s, z1.b, z25.b\n"
+      ".inst 0x45d89835  // ummla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x45db9832  // ummla z18.s, z1.b, z27.b\n"
+      ".inst 0x45da9836  // ummla z22.s, z1.b, z26.b\n"
+      ".inst 0x45d99833  // ummla z19.s, z1.b, z25.b\n"
+      ".inst 0x45d89837  // ummla z23.s, z1.b, z24.b\n"
       "add x24, x24, #0x10\n"
       "tbnz %x[flags], #31, 8f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -171,43 +171,43 @@
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "trn1 z0.d, z1.d, z27.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45d89810  // ummla z16.s, z0.b, z24.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "trn2 z1.d, z1.d, z27.d\n"
+      ".inst 0x45da9814  // ummla z20.s, z0.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45d99811  // ummla z17.s, z0.b, z25.b\n"
+      ".inst 0x45d89815  // ummla z21.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45db9812  // ummla z18.s, z0.b, z27.b\n"
+      ".inst 0x45da9816  // ummla z22.s, z0.b, z26.b\n"
+      ".inst 0x45d99813  // ummla z19.s, z0.b, z25.b\n"
+      ".inst 0x45d89817  // ummla z23.s, z0.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45d89830  // ummla z16.s, z1.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x45d89834  // ummla z20.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45d99831  // ummla z17.s, z1.b, z25.b\n"
+      ".inst 0x45d89835  // ummla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45d99832  // ummla z18.s, z1.b, z25.b\n"
+      ".inst 0x45d89836  // ummla z22.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45d99833  // ummla z19.s, z1.b, z25.b\n"
+      ".inst 0x45d89837  // ummla z23.s, z1.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 11f\n"
@@ -224,74 +224,74 @@
       "uzp1 z19.d, z19.d, z23.d\n"
       "mov z23.d, z16.d\n"
       "tbnz %x[flags], #31, 12f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z1.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
-      "neg z1.s, p2/M, z1.s\n"
+      "neg z16.s, p2/M, z16.s\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z1.s\n"
+      "mul z11.s, p2/M, z11.s, z16.s\n"
       "12:"  // Height 1: skip row sum fixup
       "add z23.s, z23.s, z11.s\n"
       "add z17.s, z17.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x10]\n"
+      "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z18.s, z18.s, z11.s\n"
       "add z19.s, z19.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      "add z23.s, z23.s, z22.s\n"
+      "add z17.s, z17.s, z21.s\n"
+      "add z18.s, z18.s, z20.s\n"
+      "add z19.s, z19.s, z16.s\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      ".inst 0x04b076f7  // sqrdmulh z23.s, z23.s, z16.s\n"
+      ".inst 0x04b07631  // sqrdmulh z17.s, z17.s, z16.s\n"
       "addvl x10, x10, #4\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04b07652  // sqrdmulh z18.s, z18.s, z16.s\n"
+      ".inst 0x04b07673  // sqrdmulh z19.s, z19.s, z16.s\n"
       "tbz %x[flags], #5, 13f\n"
-      "and z4.d, z23.d, z0.d\n"
-      "and z5.d, z17.d, z0.d\n"
-      "and z6.d, z18.d, z0.d\n"
-      "and z7.d, z19.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "sqadd z17.s, z17.s, z5.s\n"
-      "sqadd z18.s, z18.s, z6.s\n"
-      "sqadd z19.s, z19.s, z7.s\n"
+      "and z22.d, z23.d, z0.d\n"
+      "and z21.d, z17.d, z0.d\n"
+      "and z20.d, z18.d, z0.d\n"
+      "and z16.d, z19.d, z0.d\n"
+      "asr z22.s, z22.s, #0x1f\n"
+      "asr z21.s, z21.s, #0x1f\n"
+      "asr z20.s, z20.s, #0x1f\n"
+      "asr z16.s, z16.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z22.s\n"
+      "sqadd z17.s, z17.s, z21.s\n"
+      "sqadd z18.s, z18.s, z20.s\n"
+      "sqadd z19.s, z19.s, z16.s\n"
       "13:"  // Height 1: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z23.s, z23.s, z16.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z16.s\n"
+      "add z18.s, z18.s, z16.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z20.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z16.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z16.s }, p2/Z, [x20]\n"
+      "smin z23.s, p2/M, z23.s, z20.s\n"
+      "smin z17.s, p2/M, z17.s, z20.s\n"
+      "smin z18.s, p2/M, z18.s, z20.s\n"
+      "smin z19.s, p2/M, z19.s, z20.s\n"
+      "smax z23.s, p2/M, z23.s, z16.s\n"
+      "smax z17.s, p2/M, z17.s, z16.s\n"
+      "smax z18.s, p2/M, z18.s, z16.s\n"
       "uzp1 z23.h, z23.h, z17.h\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "uzp1 z17.h, z18.h, z19.h\n"
-      "uzp1 z23.b, z23.b, z17.b\n"
+      "smax z19.s, p2/M, z19.s, z16.s\n"
+      "uzp1 z16.h, z18.h, z19.h\n"
+      "uzp1 z23.b, z23.b, z16.b\n"
       "st1b { z23.b }, p1, [x27]\n"
       "addvl x27, x27, #1\n"
       "14:"  // Height 1: Writeback done
@@ -324,12 +324,12 @@
       "18:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 19f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
       "cbnz x26, 20f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -337,49 +337,49 @@
       "b 20f\n"
       "19:"  // Height 2: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
+      "add x23, x24, x21\n"
       "20:"  // Height 2: input setup done
       "cmp x25, #0x10\n"
       "ble 23f\n"
       "21:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "ld1rqb { z2.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+      "ld1rqb { z26.b }, p0/Z, [x23]\n"
+      "trn1 z0.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45d89810  // ummla z16.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "trn2 z1.d, z1.d, z26.d\n"
+      "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45d99814  // ummla z20.s, z0.b, z25.b\n"
+      ".inst 0x45d89811  // ummla z17.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45da9815  // ummla z21.s, z0.b, z26.b\n"
+      ".inst 0x45d99812  // ummla z18.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
+      ".inst 0x45d89816  // ummla z22.s, z0.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x45da9813  // ummla z19.s, z0.b, z26.b\n"
+      ".inst 0x45d99817  // ummla z23.s, z0.b, z25.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x45d89830  // ummla z16.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x45da9834  // ummla z20.s, z1.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      ".inst 0x45d99831  // ummla z17.s, z1.b, z25.b\n"
+      ".inst 0x45d89835  // ummla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x45db9832  // ummla z18.s, z1.b, z27.b\n"
+      ".inst 0x45da9836  // ummla z22.s, z1.b, z26.b\n"
+      ".inst 0x45d99833  // ummla z19.s, z1.b, z25.b\n"
+      ".inst 0x45d89837  // ummla z23.s, z1.b, z24.b\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
       "tbnz %x[flags], #31, 22f\n"
@@ -392,44 +392,44 @@
       "23:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x25\n"
       "ld1rqb { z1.b }, p0/Z, [x24]\n"
-      "ld1rqb { z2.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn1 z0.d, z1.d, z27.d\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45d89810  // ummla z16.s, z0.b, z24.b\n"
+      "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "trn2 z1.d, z1.d, z27.d\n"
+      ".inst 0x45da9814  // ummla z20.s, z0.b, z26.b\n"
+      "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45d99811  // ummla z17.s, z0.b, z25.b\n"
+      ".inst 0x45d89815  // ummla z21.s, z0.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45db9812  // ummla z18.s, z0.b, z27.b\n"
+      ".inst 0x45da9816  // ummla z22.s, z0.b, z26.b\n"
+      ".inst 0x45d99813  // ummla z19.s, z0.b, z25.b\n"
+      ".inst 0x45d89817  // ummla z23.s, z0.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "ble 24f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
+      "ld1b { z24.b }, p2/Z, [x28]\n"
+      ".inst 0x45d89830  // ummla z16.s, z1.b, z24.b\n"
+      "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x45d89834  // ummla z20.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45d99831  // ummla z17.s, z1.b, z25.b\n"
+      ".inst 0x45d89835  // ummla z21.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45d99832  // ummla z18.s, z1.b, z25.b\n"
+      ".inst 0x45d89836  // ummla z22.s, z1.b, z24.b\n"
+      "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45d99833  // ummla z19.s, z1.b, z25.b\n"
+      ".inst 0x45d89837  // ummla z23.s, z1.b, z24.b\n"
       "addvl x28, x28, #8\n"
       "24:"  // Height 2: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 25f\n"
@@ -440,133 +440,133 @@
       "add x26, x26, #0x1\n"
       "cmp x26, x20\n"
       "bne 18b\n"
-      "uzp1 z7.d, z16.d, z20.d\n"
+      "uzp1 z24.d, z16.d, z20.d\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "add x22, x27, x20\n"
+      "add x23, x27, x20\n"
       "uzp1 z20.d, z17.d, z21.d\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
       "uzp2 z18.d, z18.d, z22.d\n"
       "uzp1 z22.d, z19.d, z23.d\n"
       "uzp2 z19.d, z19.d, z23.d\n"
-      "mov z23.d, z7.d\n"
+      "mov z23.d, z24.d\n"
       "tbnz %x[flags], #31, 26f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z2.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
-      "neg z2.s, p2/M, z2.s\n"
+      "neg z24.s, p2/M, z24.s\n"
       "mov z12.s, z11.s[3]\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z2.s\n"
-      "mul z12.s, p2/M, z12.s, z2.s\n"
+      "mul z11.s, p2/M, z11.s, z24.s\n"
+      "mul z12.s, p2/M, z12.s, z24.s\n"
       "26:"  // Height 2: skip row sum fixup
       "add z23.s, z23.s, z11.s\n"
       "add z20.s, z20.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x10]\n"
+      "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z21.s, z21.s, z11.s\n"
       "add z22.s, z22.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z16.s, z16.s, z12.s\n"
       "add z17.s, z17.s, z12.s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z18.s, z18.s, z12.s\n"
       "add z19.s, z19.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z23.s, z23.s, z28.s\n"
+      "add z20.s, z20.s, z27.s\n"
       "addvl x10, x10, #4\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      "add z21.s, z21.s, z26.s\n"
+      "add z22.s, z22.s, z25.s\n"
+      "add z16.s, z16.s, z28.s\n"
+      "add z17.s, z17.s, z27.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z18.s, z18.s, z26.s\n"
+      "add z19.s, z19.s, z25.s\n"
+      ".inst 0x04b876f7  // sqrdmulh z23.s, z23.s, z24.s\n"
+      ".inst 0x04b87694  // sqrdmulh z20.s, z20.s, z24.s\n"
+      ".inst 0x04b876b5  // sqrdmulh z21.s, z21.s, z24.s\n"
+      ".inst 0x04b876d6  // sqrdmulh z22.s, z22.s, z24.s\n"
+      ".inst 0x04b87610  // sqrdmulh z16.s, z16.s, z24.s\n"
+      ".inst 0x04b87631  // sqrdmulh z17.s, z17.s, z24.s\n"
+      ".inst 0x04b87652  // sqrdmulh z18.s, z18.s, z24.s\n"
+      ".inst 0x04b87673  // sqrdmulh z19.s, z19.s, z24.s\n"
       "tbz %x[flags], #5, 27f\n"
-      "and z4.d, z23.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z23.s, z23.s, z4.s\n"
-      "and z5.d, z20.d, z0.d\n"
-      "and z6.d, z21.d, z0.d\n"
-      "and z7.d, z22.d, z0.d\n"
-      "and z8.d, z16.d, z0.d\n"
-      "and z9.d, z17.d, z0.d\n"
-      "and z10.d, z18.d, z0.d\n"
-      "and z4.d, z19.d, z0.d\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "sqadd z16.s, z16.s, z8.s\n"
-      "sqadd z17.s, z17.s, z9.s\n"
-      "sqadd z18.s, z18.s, z10.s\n"
-      "sqadd z19.s, z19.s, z4.s\n"
+      "and z24.d, z23.d, z0.d\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z24.s\n"
+      "and z30.d, z20.d, z0.d\n"
+      "and z29.d, z21.d, z0.d\n"
+      "and z28.d, z22.d, z0.d\n"
+      "and z27.d, z16.d, z0.d\n"
+      "and z26.d, z17.d, z0.d\n"
+      "and z25.d, z18.d, z0.d\n"
+      "and z24.d, z19.d, z0.d\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z27.s, z27.s, #0x1f\n"
+      "asr z26.s, z26.s, #0x1f\n"
+      "asr z25.s, z25.s, #0x1f\n"
+      "asr z24.s, z24.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z30.s\n"
+      "sqadd z21.s, z21.s, z29.s\n"
+      "sqadd z22.s, z22.s, z28.s\n"
+      "sqadd z16.s, z16.s, z27.s\n"
+      "sqadd z17.s, z17.s, z26.s\n"
+      "sqadd z18.s, z18.s, z25.s\n"
+      "sqadd z19.s, z19.s, z24.s\n"
       "27:"  // Height 2: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z23.s, z23.s, z24.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z24.s\n"
+      "add z21.s, z21.s, z24.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z24.s\n"
+      "add z16.s, z16.s, z24.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z24.s\n"
+      "add z18.s, z18.s, z24.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z25.s }, p2/Z, [x20]\n"
+      "add z19.s, z19.s, z24.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z24.s }, p2/Z, [x20]\n"
+      "smin z23.s, p2/M, z23.s, z25.s\n"
+      "smin z20.s, p2/M, z20.s, z25.s\n"
+      "smin z21.s, p2/M, z21.s, z25.s\n"
+      "smin z22.s, p2/M, z22.s, z25.s\n"
+      "smin z16.s, p2/M, z16.s, z25.s\n"
+      "smin z17.s, p2/M, z17.s, z25.s\n"
+      "smin z18.s, p2/M, z18.s, z25.s\n"
+      "smin z19.s, p2/M, z19.s, z25.s\n"
+      "smax z23.s, p2/M, z23.s, z24.s\n"
+      "smax z20.s, p2/M, z20.s, z24.s\n"
+      "smax z21.s, p2/M, z21.s, z24.s\n"
       "uzp1 z23.h, z23.h, z20.h\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z22.s, p2/M, z22.s, z24.s\n"
+      "smax z16.s, p2/M, z16.s, z24.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z23.b, z23.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z24.s\n"
+      "smax z18.s, p2/M, z18.s, z24.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
       "st1b { z23.b }, p1, [x27]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z24.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "st1b { z16.b }, p1, [x22]\n"
+      "st1b { z16.b }, p1, [x23]\n"
       "addvl x27, x27, #1\n"
       "28:"  // Height 2: Writeback done
       "decw x9, ALL, MUL #4\n"
@@ -607,13 +607,13 @@
       "32:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 33f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
       "cbnz x26, 34f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -622,8 +622,8 @@
       "b 34f\n"
       "33:"  // Height 3: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "34:"  // Height 3: input setup done
       "cmp x25, #0x10\n"
       "ble 37f\n"
@@ -634,60 +634,60 @@
       "ld1rqb { z3.b }, p0/Z, [x22]\n"
       "trn1 z0.d, z1.d, z2.d\n"
       "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      ".inst 0x45c59858  // ummla z24.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c6985c  // ummla z28.s, z2.b, z6.b\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "trn1 z2.d, z3.d, z5.d\n"
+      "trn2 z3.d, z3.d, z5.d\n"
+      ".inst 0x45c49810  // ummla z16.s, z0.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      ".inst 0x45c49858  // ummla z24.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45c59814  // ummla z20.s, z0.b, z5.b\n"
+      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45c5985c  // ummla z28.s, z2.b, z5.b\n"
+      ".inst 0x45c49811  // ummla z17.s, z0.b, z4.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
       "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      ".inst 0x45c79859  // ummla z25.s, z2.b, z7.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      ".inst 0x45c8985d  // ummla z29.s, z2.b, z8.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      ".inst 0x45c9985a  // ummla z26.s, z2.b, z9.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      ".inst 0x45ca985e  // ummla z30.s, z2.b, z10.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      ".inst 0x45c49859  // ummla z25.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x45c99815  // ummla z21.s, z0.b, z9.b\n"
+      ".inst 0x45c9985d  // ummla z29.s, z2.b, z9.b\n"
+      "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      ".inst 0x45c89812  // ummla z18.s, z0.b, z8.b\n"
+      ".inst 0x45c8985a  // ummla z26.s, z2.b, z8.b\n"
+      "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      ".inst 0x45c79816  // ummla z22.s, z0.b, z7.b\n"
+      ".inst 0x45c7985e  // ummla z30.s, z2.b, z7.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c4985b  // ummla z27.s, z2.b, z4.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      ".inst 0x45c69813  // ummla z19.s, z0.b, z6.b\n"
+      ".inst 0x45c6985b  // ummla z27.s, z2.b, z6.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
       "add x23, x23, #0x10\n"
       ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
       ".inst 0x45c5985f  // ummla z31.s, z2.b, z5.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      ".inst 0x45c69878  // ummla z24.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      ".inst 0x45c7987c  // ummla z28.s, z3.b, z7.b\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c89879  // ummla z25.s, z3.b, z8.b\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      ".inst 0x45c9987d  // ummla z29.s, z3.b, z9.b\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45ca987a  // ummla z26.s, z3.b, z10.b\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      ".inst 0x45c4987e  // ummla z30.s, z3.b, z4.b\n"
+      ".inst 0x45c49830  // ummla z16.s, z1.b, z4.b\n"
+      ".inst 0x45c49878  // ummla z24.s, z3.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x45ca9834  // ummla z20.s, z1.b, z10.b\n"
+      ".inst 0x45ca987c  // ummla z28.s, z3.b, z10.b\n"
+      ".inst 0x45c99831  // ummla z17.s, z1.b, z9.b\n"
+      ".inst 0x45c99879  // ummla z25.s, z3.b, z9.b\n"
+      ".inst 0x45c89835  // ummla z21.s, z1.b, z8.b\n"
+      ".inst 0x45c8987d  // ummla z29.s, z3.b, z8.b\n"
+      ".inst 0x45c79832  // ummla z18.s, z1.b, z7.b\n"
+      ".inst 0x45c7987a  // ummla z26.s, z3.b, z7.b\n"
+      ".inst 0x45c69836  // ummla z22.s, z1.b, z6.b\n"
+      ".inst 0x45c6987e  // ummla z30.s, z3.b, z6.b\n"
       ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
       ".inst 0x45c5987b  // ummla z27.s, z3.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
-      ".inst 0x45c6987f  // ummla z31.s, z3.b, z6.b\n"
+      ".inst 0x45c49837  // ummla z23.s, z1.b, z4.b\n"
+      ".inst 0x45c4987f  // ummla z31.s, z3.b, z4.b\n"
       "tbnz %x[flags], #31, 36f\n"
       "udot z11.s, z0.b, z15.b\n"
       "udot z13.s, z2.b, z15.b\n"
@@ -708,56 +708,56 @@
       "trn1 z2.d, z3.d, z4.d\n"
       "trn2 z3.d, z3.d, z4.d\n"
       ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
       ".inst 0x45c59858  // ummla z24.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n"
       "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      ".inst 0x45c6985c  // ummla z28.s, z2.b, z6.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      ".inst 0x45c79859  // ummla z25.s, z2.b, z7.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45c49814  // ummla z20.s, z0.b, z4.b\n"
+      ".inst 0x45c4985c  // ummla z28.s, z2.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45c99811  // ummla z17.s, z0.b, z9.b\n"
+      ".inst 0x45c99859  // ummla z25.s, z2.b, z9.b\n"
       ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
       ".inst 0x45c8985d  // ummla z29.s, z2.b, z8.b\n"
       "addvl x28, x28, #8\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      ".inst 0x45c9985a  // ummla z26.s, z2.b, z9.b\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      ".inst 0x45ca985e  // ummla z30.s, z2.b, z10.b\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c4985b  // ummla z27.s, z2.b, z4.b\n"
-      ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
-      ".inst 0x45c5985f  // ummla z31.s, z2.b, z5.b\n"
+      ".inst 0x45c79812  // ummla z18.s, z0.b, z7.b\n"
+      ".inst 0x45c7985a  // ummla z26.s, z2.b, z7.b\n"
+      ".inst 0x45c69816  // ummla z22.s, z0.b, z6.b\n"
+      ".inst 0x45c6985e  // ummla z30.s, z2.b, z6.b\n"
+      ".inst 0x45c59813  // ummla z19.s, z0.b, z5.b\n"
+      ".inst 0x45c5985b  // ummla z27.s, z2.b, z5.b\n"
+      ".inst 0x45c49817  // ummla z23.s, z0.b, z4.b\n"
+      ".inst 0x45c4985f  // ummla z31.s, z2.b, z4.b\n"
       "ble 38f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      ".inst 0x45c69878  // ummla z24.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      ".inst 0x45c7987c  // ummla z28.s, z3.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c89879  // ummla z25.s, z3.b, z8.b\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      ".inst 0x45c49830  // ummla z16.s, z1.b, z4.b\n"
+      ".inst 0x45c49878  // ummla z24.s, z3.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45c59834  // ummla z20.s, z1.b, z5.b\n"
+      ".inst 0x45c5987c  // ummla z28.s, z3.b, z5.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45c49831  // ummla z17.s, z1.b, z4.b\n"
+      ".inst 0x45c49879  // ummla z25.s, z3.b, z4.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      ".inst 0x45c9987d  // ummla z29.s, z3.b, z9.b\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45ca987a  // ummla z26.s, z3.b, z10.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45c89835  // ummla z21.s, z1.b, z8.b\n"
+      ".inst 0x45c8987d  // ummla z29.s, z3.b, z8.b\n"
+      ".inst 0x45c79832  // ummla z18.s, z1.b, z7.b\n"
+      ".inst 0x45c7987a  // ummla z26.s, z3.b, z7.b\n"
       "addvl x28, x28, #8\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      ".inst 0x45c4987e  // ummla z30.s, z3.b, z4.b\n"
+      ".inst 0x45c69836  // ummla z22.s, z1.b, z6.b\n"
+      ".inst 0x45c6987e  // ummla z30.s, z3.b, z6.b\n"
       ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
       ".inst 0x45c5987b  // ummla z27.s, z3.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
-      ".inst 0x45c6987f  // ummla z31.s, z3.b, z6.b\n"
+      ".inst 0x45c49837  // ummla z23.s, z1.b, z4.b\n"
+      ".inst 0x45c4987f  // ummla z31.s, z3.b, z4.b\n"
       "38:"  // Height 3: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 39f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -770,12 +770,12 @@
       "cmp x26, x20\n"
       "bne 32b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 z7.d, z16.d, z20.d\n"
-      "add x22, x27, x20\n"
+      "uzp1 z0.d, z16.d, z20.d\n"
+      "add x23, x27, x20\n"
       "uzp2 z16.d, z16.d, z20.d\n"
       "uzp1 z20.d, z17.d, z21.d\n"
       "uzp2 z17.d, z17.d, z21.d\n"
-      "add x21, x22, x20\n"
+      "add x22, x23, x20\n"
       "uzp1 z21.d, z18.d, z22.d\n"
       "uzp2 z18.d, z18.d, z22.d\n"
       "uzp1 z22.d, z19.d, z23.d\n"
@@ -784,170 +784,170 @@
       "uzp1 z25.d, z25.d, z29.d\n"
       "uzp1 z26.d, z26.d, z30.d\n"
       "uzp1 z27.d, z27.d, z31.d\n"
-      "mov z31.d, z7.d\n"
+      "mov z31.d, z0.d\n"
       "tbnz %x[flags], #31, 40f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z3.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
       ".inst 0x4491a9ad  // addp z13.s, p2/m, z13.s, z13.s\n"
-      "neg z3.s, p2/M, z3.s\n"
+      "neg z23.s, p2/M, z23.s\n"
       "mov z12.s, z11.s[3]\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z3.s\n"
+      "mul z11.s, p2/M, z11.s, z23.s\n"
       "mov z13.s, z13.s[0]\n"
-      "mul z12.s, p2/M, z12.s, z3.s\n"
-      "mul z13.s, p2/M, z13.s, z3.s\n"
+      "mul z12.s, p2/M, z12.s, z23.s\n"
+      "mul z13.s, p2/M, z13.s, z23.s\n"
       "40:"  // Height 3: skip row sum fixup
       "add z31.s, z31.s, z11.s\n"
       "add z20.s, z20.s, z11.s\n"
       "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z21.s, z21.s, z11.s\n"
       "add z22.s, z22.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z16.s, z16.s, z12.s\n"
       "add z17.s, z17.s, z12.s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z18.s, z18.s, z12.s\n"
       "add z19.s, z19.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z24.s, z24.s, z13.s\n"
       "add z25.s, z25.s, z13.s\n"
       "addvl x10, x10, #4\n"
       "add z26.s, z26.s, z13.s\n"
       "add z27.s, z27.s, z13.s\n"
       "add z31.s, z31.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
+      "add z20.s, z20.s, z30.s\n"
+      "add z21.s, z21.s, z29.s\n"
+      "add z22.s, z22.s, z28.s\n"
       "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
+      "add z17.s, z17.s, z30.s\n"
+      "add z18.s, z18.s, z29.s\n"
+      "add z19.s, z19.s, z28.s\n"
       "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      "add z25.s, z25.s, z30.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z26.s, z26.s, z29.s\n"
+      "add z27.s, z27.s, z28.s\n"
+      ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+      ".inst 0x04b77694  // sqrdmulh z20.s, z20.s, z23.s\n"
+      ".inst 0x04b776b5  // sqrdmulh z21.s, z21.s, z23.s\n"
+      ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+      ".inst 0x04b77610  // sqrdmulh z16.s, z16.s, z23.s\n"
+      ".inst 0x04b77631  // sqrdmulh z17.s, z17.s, z23.s\n"
+      ".inst 0x04b77652  // sqrdmulh z18.s, z18.s, z23.s\n"
+      ".inst 0x04b77673  // sqrdmulh z19.s, z19.s, z23.s\n"
+      ".inst 0x04b77718  // sqrdmulh z24.s, z24.s, z23.s\n"
+      ".inst 0x04b77739  // sqrdmulh z25.s, z25.s, z23.s\n"
+      ".inst 0x04b7775a  // sqrdmulh z26.s, z26.s, z23.s\n"
+      ".inst 0x04b7777b  // sqrdmulh z27.s, z27.s, z23.s\n"
       "tbz %x[flags], #5, 41f\n"
-      "and z4.d, z31.d, z0.d\n"
-      "and z5.d, z20.d, z0.d\n"
-      "and z6.d, z21.d, z0.d\n"
-      "and z7.d, z22.d, z0.d\n"
-      "and z8.d, z16.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z31.s, z31.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "sqadd z16.s, z16.s, z8.s\n"
-      "and z9.d, z17.d, z0.d\n"
-      "and z10.d, z18.d, z0.d\n"
-      "and z4.d, z19.d, z0.d\n"
-      "and z5.d, z24.d, z0.d\n"
-      "and z6.d, z25.d, z0.d\n"
-      "and z7.d, z26.d, z0.d\n"
-      "and z8.d, z27.d, z0.d\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "sqadd z17.s, z17.s, z9.s\n"
-      "sqadd z18.s, z18.s, z10.s\n"
-      "sqadd z19.s, z19.s, z4.s\n"
-      "sqadd z24.s, z24.s, z5.s\n"
-      "sqadd z25.s, z25.s, z6.s\n"
-      "sqadd z26.s, z26.s, z7.s\n"
-      "sqadd z27.s, z27.s, z8.s\n"
+      "and z1.d, z31.d, z0.d\n"
+      "and z30.d, z20.d, z0.d\n"
+      "and z29.d, z21.d, z0.d\n"
+      "and z28.d, z22.d, z0.d\n"
+      "and z23.d, z16.d, z0.d\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z31.s, z31.s, z1.s\n"
+      "sqadd z20.s, z20.s, z30.s\n"
+      "sqadd z21.s, z21.s, z29.s\n"
+      "sqadd z22.s, z22.s, z28.s\n"
+      "sqadd z16.s, z16.s, z23.s\n"
+      "and z3.d, z17.d, z0.d\n"
+      "and z2.d, z18.d, z0.d\n"
+      "and z1.d, z19.d, z0.d\n"
+      "and z30.d, z24.d, z0.d\n"
+      "and z29.d, z25.d, z0.d\n"
+      "and z28.d, z26.d, z0.d\n"
+      "and z23.d, z27.d, z0.d\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "asr z30.s, z30.s, #0x1f\n"
+      "asr z29.s, z29.s, #0x1f\n"
+      "asr z28.s, z28.s, #0x1f\n"
+      "asr z23.s, z23.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z3.s\n"
+      "sqadd z18.s, z18.s, z2.s\n"
+      "sqadd z19.s, z19.s, z1.s\n"
+      "sqadd z24.s, z24.s, z30.s\n"
+      "sqadd z25.s, z25.s, z29.s\n"
+      "sqadd z26.s, z26.s, z28.s\n"
+      "sqadd z27.s, z27.s, z23.s\n"
       "41:"  // Height 3: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
       ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
-      "add z31.s, z31.s, z4.s\n"
+      "add z31.s, z31.s, z23.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z23.s\n"
+      "add z21.s, z21.s, z23.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z23.s\n"
+      "add z16.s, z16.s, z23.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z23.s\n"
+      "add z18.s, z18.s, z23.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z19.s, z19.s, z23.s\n"
+      "add z24.s, z24.s, z23.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z23.s\n"
+      "add z26.s, z26.s, z23.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z28.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z23.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z23.s }, p2/Z, [x20]\n"
+      "smin z31.s, p2/M, z31.s, z28.s\n"
+      "smin z20.s, p2/M, z20.s, z28.s\n"
+      "smin z21.s, p2/M, z21.s, z28.s\n"
+      "smin z22.s, p2/M, z22.s, z28.s\n"
+      "smin z16.s, p2/M, z16.s, z28.s\n"
+      "smin z17.s, p2/M, z17.s, z28.s\n"
+      "smin z18.s, p2/M, z18.s, z28.s\n"
+      "smin z19.s, p2/M, z19.s, z28.s\n"
+      "smin z24.s, p2/M, z24.s, z28.s\n"
+      "smin z25.s, p2/M, z25.s, z28.s\n"
+      "smin z26.s, p2/M, z26.s, z28.s\n"
+      "smin z27.s, p2/M, z27.s, z28.s\n"
+      "smax z31.s, p2/M, z31.s, z23.s\n"
+      "smax z20.s, p2/M, z20.s, z23.s\n"
+      "smax z21.s, p2/M, z21.s, z23.s\n"
       "uzp1 z31.h, z31.h, z20.h\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z22.s, p2/M, z22.s, z23.s\n"
+      "smax z16.s, p2/M, z16.s, z23.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z31.b, z31.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z23.s\n"
+      "smax z18.s, p2/M, z18.s, z23.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
       "st1b { z31.b }, p1, [x27]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z23.s\n"
+      "smax z24.s, p2/M, z24.s, z23.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z25.s, p2/M, z25.s, z23.s\n"
+      "smax z26.s, p2/M, z26.s, z23.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z16.b }, p1, [x22]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x21]\n"
+      "st1b { z16.b }, p1, [x23]\n"
+      "smax z27.s, p2/M, z27.s, z23.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x22]\n"
       "addvl x27, x27, #1\n"
       "42:"  // Height 3: Writeback done
       "decw x9, ALL, MUL #4\n"
@@ -992,14 +992,14 @@
       "46:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w25, [x20, x26, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 47f\n"
-      "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x24, [x21, #0x0]\n"
-      "ldr x23, [x21, #0x8]\n"
-      "ldr x22, [x21, #0x10]\n"
-      "ldr x21, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x24, [x20, #0x0]\n"
+      "ldr x23, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x21, [x20, #0x18]\n"
       "cbnz x26, 48f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x24, x24, x20\n"
@@ -1009,9 +1009,9 @@
       "b 48f\n"
       "47:"  // Height 4: setup direct input
       "mov x24, %x[input_ptr]\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "48:"  // Height 4: input setup done
       "cmp x25, #0x10\n"
       "ble 51f\n"
@@ -1021,63 +1021,63 @@
       "ld1rqb { z2.b }, p0/Z, [x23]\n"
       "trn1 z0.d, z1.d, z2.d\n"
       "ld1rqb { z3.b }, p0/Z, [x22]\n"
-      "ld1rqb { z4.b }, p0/Z, [x21]\n"
+      "ld1rqb { z5.b }, p0/Z, [x21]\n"
       "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      ".inst 0x45c59858  // ummla z24.s, z2.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
-      ".inst 0x45c6985c  // ummla z28.s, z2.b, z6.b\n"
-      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      ".inst 0x45c79859  // ummla z25.s, z2.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      ".inst 0x45c8985d  // ummla z29.s, z2.b, z8.b\n"
+      "trn1 z2.d, z3.d, z5.d\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "trn2 z3.d, z3.d, z5.d\n"
+      ".inst 0x45c49810  // ummla z16.s, z0.b, z4.b\n"
+      ".inst 0x45c49858  // ummla z24.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n"
+      ".inst 0x45c49814  // ummla z20.s, z0.b, z4.b\n"
+      ".inst 0x45c4985c  // ummla z28.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x45c59811  // ummla z17.s, z0.b, z5.b\n"
+      ".inst 0x45c59859  // ummla z25.s, z2.b, z5.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
+      ".inst 0x45c49815  // ummla z21.s, z0.b, z4.b\n"
+      ".inst 0x45c4985d  // ummla z29.s, z2.b, z4.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
       "addvl x28, x28, #16\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      ".inst 0x45c9985a  // ummla z26.s, z2.b, z9.b\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
-      ".inst 0x45ca985e  // ummla z30.s, z2.b, z10.b\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
-      "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
-      ".inst 0x45c4985b  // ummla z27.s, z2.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n"
+      ".inst 0x45c89812  // ummla z18.s, z0.b, z8.b\n"
+      ".inst 0x45c8985a  // ummla z26.s, z2.b, z8.b\n"
+      ".inst 0x45c79816  // ummla z22.s, z0.b, z7.b\n"
+      "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n"
+      "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n"
+      ".inst 0x45c7985e  // ummla z30.s, z2.b, z7.b\n"
+      ".inst 0x45c69813  // ummla z19.s, z0.b, z6.b\n"
+      "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+      ".inst 0x45c6985b  // ummla z27.s, z2.b, z6.b\n"
       ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
-      "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
       "add x24, x24, #0x10\n"
       ".inst 0x45c5985f  // ummla z31.s, z2.b, z5.b\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
+      ".inst 0x45c49830  // ummla z16.s, z1.b, z4.b\n"
       "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x45c69878  // ummla z24.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
+      ".inst 0x45c49878  // ummla z24.s, z3.b, z4.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n"
+      ".inst 0x45ca9834  // ummla z20.s, z1.b, z10.b\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x45c7987c  // ummla z28.s, z3.b, z7.b\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
+      ".inst 0x45ca987c  // ummla z28.s, z3.b, z10.b\n"
+      ".inst 0x45c99831  // ummla z17.s, z1.b, z9.b\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x45c89879  // ummla z25.s, z3.b, z8.b\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      ".inst 0x45c9987d  // ummla z29.s, z3.b, z9.b\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45ca987a  // ummla z26.s, z3.b, z10.b\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      ".inst 0x45c4987e  // ummla z30.s, z3.b, z4.b\n"
+      ".inst 0x45c99879  // ummla z25.s, z3.b, z9.b\n"
+      ".inst 0x45c89835  // ummla z21.s, z1.b, z8.b\n"
+      ".inst 0x45c8987d  // ummla z29.s, z3.b, z8.b\n"
+      ".inst 0x45c79832  // ummla z18.s, z1.b, z7.b\n"
+      ".inst 0x45c7987a  // ummla z26.s, z3.b, z7.b\n"
+      ".inst 0x45c69836  // ummla z22.s, z1.b, z6.b\n"
+      ".inst 0x45c6987e  // ummla z30.s, z3.b, z6.b\n"
       ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
       ".inst 0x45c5987b  // ummla z27.s, z3.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
-      ".inst 0x45c6987f  // ummla z31.s, z3.b, z6.b\n"
+      ".inst 0x45c49837  // ummla z23.s, z1.b, z4.b\n"
+      ".inst 0x45c4987f  // ummla z31.s, z3.b, z4.b\n"
       "tbnz %x[flags], #31, 50f\n"
       "udot z11.s, z0.b, z15.b\n"
       "udot z13.s, z2.b, z15.b\n"
@@ -1093,62 +1093,62 @@
       "ld1rqb { z2.b }, p0/Z, [x23]\n"
       "trn1 z0.d, z1.d, z2.d\n"
       "ld1rqb { z3.b }, p0/Z, [x22]\n"
-      "ld1rqb { z4.b }, p0/Z, [x21]\n"
+      "ld1rqb { z5.b }, p0/Z, [x21]\n"
       "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z5.b }, p2/Z, [x28]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c59810  // ummla z16.s, z0.b, z5.b\n"
-      ".inst 0x45c59858  // ummla z24.s, z2.b, z5.b\n"
-      "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "trn1 z2.d, z3.d, z5.d\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      "trn2 z3.d, z3.d, z5.d\n"
+      ".inst 0x45c49810  // ummla z16.s, z0.b, z4.b\n"
+      ".inst 0x45c49858  // ummla z24.s, z2.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
       "subs x25, x25, #0x8\n"
-      ".inst 0x45c69814  // ummla z20.s, z0.b, z6.b\n"
+      ".inst 0x45c59814  // ummla z20.s, z0.b, z5.b\n"
       "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
-      ".inst 0x45c6985c  // ummla z28.s, z2.b, z6.b\n"
-      ".inst 0x45c79811  // ummla z17.s, z0.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
-      ".inst 0x45c79859  // ummla z25.s, z2.b, z7.b\n"
-      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
-      "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c8985d  // ummla z29.s, z2.b, z8.b\n"
-      ".inst 0x45c99812  // ummla z18.s, z0.b, z9.b\n"
-      "addvl x28, x28, #8\n"
-      ".inst 0x45c9985a  // ummla z26.s, z2.b, z9.b\n"
-      ".inst 0x45ca9816  // ummla z22.s, z0.b, z10.b\n"
-      ".inst 0x45ca985e  // ummla z30.s, z2.b, z10.b\n"
-      ".inst 0x45c49813  // ummla z19.s, z0.b, z4.b\n"
-      ".inst 0x45c4985b  // ummla z27.s, z2.b, z4.b\n"
-      ".inst 0x45c59817  // ummla z23.s, z0.b, z5.b\n"
-      ".inst 0x45c5985f  // ummla z31.s, z2.b, z5.b\n"
-      "ble 52f\n"
-      "ld1b { z6.b }, p2/Z, [x28]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      ".inst 0x45c69878  // ummla z24.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
-      "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
-      "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
-      ".inst 0x45c79834  // ummla z20.s, z1.b, z7.b\n"
-      ".inst 0x45c7987c  // ummla z28.s, z3.b, z7.b\n"
-      "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
-      "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
-      ".inst 0x45c89831  // ummla z17.s, z1.b, z8.b\n"
-      ".inst 0x45c89879  // ummla z25.s, z3.b, z8.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      ".inst 0x45c5985c  // ummla z28.s, z2.b, z5.b\n"
+      ".inst 0x45c49811  // ummla z17.s, z0.b, z4.b\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
       "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
-      "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
-      ".inst 0x45c99835  // ummla z21.s, z1.b, z9.b\n"
-      ".inst 0x45c9987d  // ummla z29.s, z3.b, z9.b\n"
-      ".inst 0x45ca9832  // ummla z18.s, z1.b, z10.b\n"
-      ".inst 0x45ca987a  // ummla z26.s, z3.b, z10.b\n"
+      ".inst 0x45c49859  // ummla z25.s, z2.b, z4.b\n"
+      ".inst 0x45c89815  // ummla z21.s, z0.b, z8.b\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45c8985d  // ummla z29.s, z2.b, z8.b\n"
+      ".inst 0x45c79812  // ummla z18.s, z0.b, z7.b\n"
       "addvl x28, x28, #8\n"
-      ".inst 0x45c49836  // ummla z22.s, z1.b, z4.b\n"
-      ".inst 0x45c4987e  // ummla z30.s, z3.b, z4.b\n"
+      ".inst 0x45c7985a  // ummla z26.s, z2.b, z7.b\n"
+      ".inst 0x45c69816  // ummla z22.s, z0.b, z6.b\n"
+      ".inst 0x45c6985e  // ummla z30.s, z2.b, z6.b\n"
+      ".inst 0x45c59813  // ummla z19.s, z0.b, z5.b\n"
+      ".inst 0x45c5985b  // ummla z27.s, z2.b, z5.b\n"
+      ".inst 0x45c49817  // ummla z23.s, z0.b, z4.b\n"
+      ".inst 0x45c4985f  // ummla z31.s, z2.b, z4.b\n"
+      "ble 52f\n"
+      "ld1b { z4.b }, p2/Z, [x28]\n"
+      ".inst 0x45c49830  // ummla z16.s, z1.b, z4.b\n"
+      ".inst 0x45c49878  // ummla z24.s, z3.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+      ".inst 0x45c59834  // ummla z20.s, z1.b, z5.b\n"
+      ".inst 0x45c5987c  // ummla z28.s, z3.b, z5.b\n"
+      "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+      ".inst 0x45c49831  // ummla z17.s, z1.b, z4.b\n"
+      ".inst 0x45c49879  // ummla z25.s, z3.b, z4.b\n"
+      "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+      "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+      ".inst 0x45c89835  // ummla z21.s, z1.b, z8.b\n"
+      ".inst 0x45c8987d  // ummla z29.s, z3.b, z8.b\n"
+      ".inst 0x45c79832  // ummla z18.s, z1.b, z7.b\n"
+      ".inst 0x45c7987a  // ummla z26.s, z3.b, z7.b\n"
+      "addvl x28, x28, #8\n"
+      ".inst 0x45c69836  // ummla z22.s, z1.b, z6.b\n"
+      ".inst 0x45c6987e  // ummla z30.s, z3.b, z6.b\n"
       ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
       ".inst 0x45c5987b  // ummla z27.s, z3.b, z5.b\n"
-      ".inst 0x45c69837  // ummla z23.s, z1.b, z6.b\n"
-      ".inst 0x45c6987f  // ummla z31.s, z3.b, z6.b\n"
+      ".inst 0x45c49837  // ummla z23.s, z1.b, z4.b\n"
+      ".inst 0x45c4987f  // ummla z31.s, z3.b, z4.b\n"
       "52:"  // Height 4: Multiply loop: multiply skip
       "tbnz %x[flags], #31, 53f\n"
       "udot z11.s, z0.b, z15.b\n"
@@ -1161,12 +1161,12 @@
       "cmp x26, x20\n"
       "bne 46b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "uzp1 z7.d, z16.d, z20.d\n"
-      "add x22, x27, x20\n"
-      "add x21, x22, x20\n"
+      "uzp1 z0.d, z16.d, z20.d\n"
+      "add x23, x27, x20\n"
+      "add x22, x23, x20\n"
       "uzp2 z16.d, z16.d, z20.d\n"
       "uzp1 z20.d, z17.d, z21.d\n"
-      "add x20, x21, x20\n"
+      "add x21, x22, x20\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
       "uzp2 z18.d, z18.d, z22.d\n"
@@ -1180,38 +1180,38 @@
       "uzp2 z26.d, z26.d, z30.d\n"
       "uzp1 z30.d, z27.d, z31.d\n"
       "uzp2 z27.d, z27.d, z31.d\n"
-      "mov z31.d, z7.d\n"
+      "mov z31.d, z0.d\n"
       "tbnz %x[flags], #31, 54f\n"
-      "add x23, %x[qp], %[b_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
+      "add x20, %x[qp], %[b_offset]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
       ".inst 0x4491a96b  // addp z11.s, p2/m, z11.s, z11.s\n"
       ".inst 0x4491a9ad  // addp z13.s, p2/m, z13.s, z13.s\n"
-      "neg z4.s, p2/M, z4.s\n"
+      "neg z0.s, p2/M, z0.s\n"
       "mov z12.s, z11.s[3]\n"
       "mov z11.s, z11.s[0]\n"
-      "mul z11.s, p2/M, z11.s, z4.s\n"
+      "mul z11.s, p2/M, z11.s, z0.s\n"
       "mov z14.s, z13.s[3]\n"
       "mov z13.s, z13.s[0]\n"
-      "mul z12.s, p2/M, z12.s, z4.s\n"
-      "mul z13.s, p2/M, z13.s, z4.s\n"
-      "mul z14.s, p2/M, z14.s, z4.s\n"
+      "mul z12.s, p2/M, z12.s, z0.s\n"
+      "mul z13.s, p2/M, z13.s, z0.s\n"
+      "mul z14.s, p2/M, z14.s, z0.s\n"
       "54:"  // Height 4: skip row sum fixup
       "add z31.s, z31.s, z11.s\n"
       "add z20.s, z20.s, z11.s\n"
-      "ld1w { z0.s }, p2/Z, [x10]\n"
-      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "ld1w { z4.s }, p2/Z, [x10]\n"
+      "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n"
       "add z21.s, z21.s, z11.s\n"
       "add z22.s, z22.s, z11.s\n"
-      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
-      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n"
       "add z16.s, z16.s, z12.s\n"
       "add z17.s, z17.s, z12.s\n"
-      "add x23, %x[qp], %[per_layer_mul]\n"
+      "add x20, %x[qp], %[per_layer_mul]\n"
       "orr %x[flags], %x[flags], #0x80000000\n"
       "add z18.s, z18.s, z12.s\n"
       "add z19.s, z19.s, z12.s\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      "add x23, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
       "add z23.s, z23.s, z13.s\n"
       "add z28.s, z28.s, z13.s\n"
       "addvl x10, x10, #4\n"
@@ -1221,175 +1221,175 @@
       "add z25.s, z25.s, z14.s\n"
       "add z26.s, z26.s, z14.s\n"
       "add z27.s, z27.s, z14.s\n"
-      "add z31.s, z31.s, z0.s\n"
-      "add z20.s, z20.s, z1.s\n"
-      "add z21.s, z21.s, z2.s\n"
-      "add z22.s, z22.s, z3.s\n"
-      "add z16.s, z16.s, z0.s\n"
-      "add z17.s, z17.s, z1.s\n"
-      "add z18.s, z18.s, z2.s\n"
-      "add z19.s, z19.s, z3.s\n"
-      "add z23.s, z23.s, z0.s\n"
-      "add z28.s, z28.s, z1.s\n"
-      "add z29.s, z29.s, z2.s\n"
-      "add z30.s, z30.s, z3.s\n"
-      "add z24.s, z24.s, z0.s\n"
-      "add z25.s, z25.s, z1.s\n"
-      "ld1rw { z0.s }, p2/Z, [x23]\n"
-      "add z26.s, z26.s, z2.s\n"
-      "add z27.s, z27.s, z3.s\n"
-      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
-      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
-      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
-      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
-      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
-      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
-      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
-      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
-      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
-      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
-      ".inst 0x04a477bd  // sqrdmulh z29.s, z29.s, z4.s\n"
-      ".inst 0x04a477de  // sqrdmulh z30.s, z30.s, z4.s\n"
-      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
-      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
-      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
-      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
-      "tbz %x[flags], #5, 55f\n"
-      "and z4.d, z31.d, z0.d\n"
-      "and z5.d, z20.d, z0.d\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z31.s, z31.s, z4.s\n"
-      "sqadd z20.s, z20.s, z5.s\n"
-      "and z6.d, z21.d, z0.d\n"
-      "and z7.d, z22.d, z0.d\n"
-      "and z8.d, z16.d, z0.d\n"
-      "and z9.d, z17.d, z0.d\n"
-      "and z10.d, z18.d, z0.d\n"
-      "and z4.d, z19.d, z0.d\n"
-      "and z5.d, z23.d, z0.d\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z21.s, z21.s, z6.s\n"
-      "sqadd z22.s, z22.s, z7.s\n"
-      "sqadd z16.s, z16.s, z8.s\n"
-      "sqadd z17.s, z17.s, z9.s\n"
-      "sqadd z18.s, z18.s, z10.s\n"
-      "sqadd z19.s, z19.s, z4.s\n"
-      "sqadd z23.s, z23.s, z5.s\n"
-      "and z6.d, z28.d, z0.d\n"
-      "and z7.d, z29.d, z0.d\n"
-      "and z8.d, z30.d, z0.d\n"
-      "and z9.d, z24.d, z0.d\n"
-      "and z10.d, z25.d, z0.d\n"
-      "and z4.d, z26.d, z0.d\n"
-      "and z5.d, z27.d, z0.d\n"
-      "asr z6.s, z6.s, #0x1f\n"
-      "asr z7.s, z7.s, #0x1f\n"
-      "asr z8.s, z8.s, #0x1f\n"
-      "asr z9.s, z9.s, #0x1f\n"
-      "asr z10.s, z10.s, #0x1f\n"
-      "asr z4.s, z4.s, #0x1f\n"
-      "asr z5.s, z5.s, #0x1f\n"
-      "sqadd z28.s, z28.s, z6.s\n"
-      "sqadd z29.s, z29.s, z7.s\n"
-      "sqadd z30.s, z30.s, z8.s\n"
-      "sqadd z24.s, z24.s, z9.s\n"
-      "sqadd z25.s, z25.s, z10.s\n"
-      "sqadd z26.s, z26.s, z4.s\n"
-      "sqadd z27.s, z27.s, z5.s\n"
-      "55:"  // Height 4: no shift correction
-      "add x23, %x[qp], %[c_offset]\n"
-      "ld1rw { z4.s }, p2/Z, [x23]\n"
-      ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
       "add z31.s, z31.s, z4.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z3.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z0.s\n"
+      "add z18.s, z18.s, z3.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z28.s, z28.s, z0.s\n"
+      "add z29.s, z29.s, z3.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z26.s, z26.s, z3.s\n"
+      "add z27.s, z27.s, z2.s\n"
+      ".inst 0x04a177ff  // sqrdmulh z31.s, z31.s, z1.s\n"
+      ".inst 0x04a17694  // sqrdmulh z20.s, z20.s, z1.s\n"
+      ".inst 0x04a176b5  // sqrdmulh z21.s, z21.s, z1.s\n"
+      ".inst 0x04a176d6  // sqrdmulh z22.s, z22.s, z1.s\n"
+      ".inst 0x04a17610  // sqrdmulh z16.s, z16.s, z1.s\n"
+      ".inst 0x04a17631  // sqrdmulh z17.s, z17.s, z1.s\n"
+      ".inst 0x04a17652  // sqrdmulh z18.s, z18.s, z1.s\n"
+      ".inst 0x04a17673  // sqrdmulh z19.s, z19.s, z1.s\n"
+      ".inst 0x04a176f7  // sqrdmulh z23.s, z23.s, z1.s\n"
+      ".inst 0x04a1779c  // sqrdmulh z28.s, z28.s, z1.s\n"
+      ".inst 0x04a177bd  // sqrdmulh z29.s, z29.s, z1.s\n"
+      ".inst 0x04a177de  // sqrdmulh z30.s, z30.s, z1.s\n"
+      ".inst 0x04a17718  // sqrdmulh z24.s, z24.s, z1.s\n"
+      ".inst 0x04a17739  // sqrdmulh z25.s, z25.s, z1.s\n"
+      ".inst 0x04a1775a  // sqrdmulh z26.s, z26.s, z1.s\n"
+      ".inst 0x04a1777b  // sqrdmulh z27.s, z27.s, z1.s\n"
+      "tbz %x[flags], #5, 55f\n"
+      "and z2.d, z31.d, z0.d\n"
+      "and z1.d, z20.d, z0.d\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z31.s, z31.s, z2.s\n"
+      "sqadd z20.s, z20.s, z1.s\n"
+      "and z7.d, z21.d, z0.d\n"
+      "and z6.d, z22.d, z0.d\n"
+      "and z5.d, z16.d, z0.d\n"
+      "and z4.d, z17.d, z0.d\n"
+      "and z3.d, z18.d, z0.d\n"
+      "and z2.d, z19.d, z0.d\n"
+      "and z1.d, z23.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z7.s\n"
+      "sqadd z22.s, z22.s, z6.s\n"
+      "sqadd z16.s, z16.s, z5.s\n"
+      "sqadd z17.s, z17.s, z4.s\n"
+      "sqadd z18.s, z18.s, z3.s\n"
+      "sqadd z19.s, z19.s, z2.s\n"
+      "sqadd z23.s, z23.s, z1.s\n"
+      "and z7.d, z28.d, z0.d\n"
+      "and z6.d, z29.d, z0.d\n"
+      "and z5.d, z30.d, z0.d\n"
+      "and z4.d, z24.d, z0.d\n"
+      "and z3.d, z25.d, z0.d\n"
+      "and z2.d, z26.d, z0.d\n"
+      "and z1.d, z27.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "asr z3.s, z3.s, #0x1f\n"
+      "asr z2.s, z2.s, #0x1f\n"
+      "asr z1.s, z1.s, #0x1f\n"
+      "sqadd z28.s, z28.s, z7.s\n"
+      "sqadd z29.s, z29.s, z6.s\n"
+      "sqadd z30.s, z30.s, z5.s\n"
+      "sqadd z24.s, z24.s, z4.s\n"
+      "sqadd z25.s, z25.s, z3.s\n"
+      "sqadd z26.s, z26.s, z2.s\n"
+      "sqadd z27.s, z27.s, z1.s\n"
+      "55:"  // Height 4: no shift correction
+      "add x20, %x[qp], %[c_offset]\n"
+      "ld1rw { z2.s }, p2/Z, [x20]\n"
+      ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
+      "add z31.s, z31.s, z2.s\n"
       ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
       ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
-      "add z20.s, z20.s, z4.s\n"
-      "add z21.s, z21.s, z4.s\n"
+      "add z20.s, z20.s, z2.s\n"
+      "add z21.s, z21.s, z2.s\n"
       ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
       ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
-      "add z22.s, z22.s, z4.s\n"
-      "add z16.s, z16.s, z4.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z16.s, z16.s, z2.s\n"
       ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
       ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
-      "add z17.s, z17.s, z4.s\n"
-      "add z18.s, z18.s, z4.s\n"
+      "add z17.s, z17.s, z2.s\n"
+      "add z18.s, z18.s, z2.s\n"
       ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
       ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
-      "add z19.s, z19.s, z4.s\n"
-      "add z23.s, z23.s, z4.s\n"
+      "add z19.s, z19.s, z2.s\n"
+      "add z23.s, z23.s, z2.s\n"
       ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
       ".inst 0x4482881d  // srshl z29.s, p2/M, z29.s, z0.s\n"
-      "add z28.s, z28.s, z4.s\n"
-      "add z29.s, z29.s, z4.s\n"
+      "add z28.s, z28.s, z2.s\n"
+      "add z29.s, z29.s, z2.s\n"
       ".inst 0x4482881e  // srshl z30.s, p2/M, z30.s, z0.s\n"
       ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
-      "add z30.s, z30.s, z4.s\n"
-      "add z24.s, z24.s, z4.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z24.s, z24.s, z2.s\n"
       ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
       ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
-      "add z25.s, z25.s, z4.s\n"
-      "add z26.s, z26.s, z4.s\n"
+      "add z25.s, z25.s, z2.s\n"
+      "add z26.s, z26.s, z2.s\n"
       ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
-      "add x23, %x[qp], %[maxval]\n"
-      "ld1rw { z6.s }, p2/Z, [x23]\n"
-      "add z27.s, z27.s, z4.s\n"
-      "add x23, %x[qp], %[minval]\n"
-      "ld1rw { z5.s }, p2/Z, [x23]\n"
-      "smin z31.s, p2/M, z31.s, z6.s\n"
-      "smin z20.s, p2/M, z20.s, z6.s\n"
-      "smin z21.s, p2/M, z21.s, z6.s\n"
-      "smin z22.s, p2/M, z22.s, z6.s\n"
-      "smin z16.s, p2/M, z16.s, z6.s\n"
-      "smin z17.s, p2/M, z17.s, z6.s\n"
-      "smin z18.s, p2/M, z18.s, z6.s\n"
-      "smin z19.s, p2/M, z19.s, z6.s\n"
-      "smin z23.s, p2/M, z23.s, z6.s\n"
-      "smin z28.s, p2/M, z28.s, z6.s\n"
-      "smin z29.s, p2/M, z29.s, z6.s\n"
-      "smin z30.s, p2/M, z30.s, z6.s\n"
-      "smin z24.s, p2/M, z24.s, z6.s\n"
-      "smin z25.s, p2/M, z25.s, z6.s\n"
-      "smin z26.s, p2/M, z26.s, z6.s\n"
-      "smin z27.s, p2/M, z27.s, z6.s\n"
-      "smax z31.s, p2/M, z31.s, z5.s\n"
-      "smax z20.s, p2/M, z20.s, z5.s\n"
-      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add x20, %x[qp], %[maxval]\n"
+      "ld1rw { z1.s }, p2/Z, [x20]\n"
+      "add z27.s, z27.s, z2.s\n"
+      "add x20, %x[qp], %[minval]\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "smin z31.s, p2/M, z31.s, z1.s\n"
+      "smin z20.s, p2/M, z20.s, z1.s\n"
+      "smin z21.s, p2/M, z21.s, z1.s\n"
+      "smin z22.s, p2/M, z22.s, z1.s\n"
+      "smin z16.s, p2/M, z16.s, z1.s\n"
+      "smin z17.s, p2/M, z17.s, z1.s\n"
+      "smin z18.s, p2/M, z18.s, z1.s\n"
+      "smin z19.s, p2/M, z19.s, z1.s\n"
+      "smin z23.s, p2/M, z23.s, z1.s\n"
+      "smin z28.s, p2/M, z28.s, z1.s\n"
+      "smin z29.s, p2/M, z29.s, z1.s\n"
+      "smin z30.s, p2/M, z30.s, z1.s\n"
+      "smin z24.s, p2/M, z24.s, z1.s\n"
+      "smin z25.s, p2/M, z25.s, z1.s\n"
+      "smin z26.s, p2/M, z26.s, z1.s\n"
+      "smin z27.s, p2/M, z27.s, z1.s\n"
+      "smax z31.s, p2/M, z31.s, z0.s\n"
+      "smax z20.s, p2/M, z20.s, z0.s\n"
+      "smax z21.s, p2/M, z21.s, z0.s\n"
       "uzp1 z31.h, z31.h, z20.h\n"
-      "smax z22.s, p2/M, z22.s, z5.s\n"
-      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z22.s, p2/M, z22.s, z0.s\n"
+      "smax z16.s, p2/M, z16.s, z0.s\n"
       "uzp1 z20.h, z21.h, z22.h\n"
       "uzp1 z31.b, z31.b, z20.b\n"
-      "smax z17.s, p2/M, z17.s, z5.s\n"
-      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z0.s\n"
+      "smax z18.s, p2/M, z18.s, z0.s\n"
       "uzp1 z16.h, z16.h, z17.h\n"
       "st1b { z31.b }, p1, [x27]\n"
-      "smax z19.s, p2/M, z19.s, z5.s\n"
-      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z0.s\n"
+      "smax z23.s, p2/M, z23.s, z0.s\n"
       "uzp1 z17.h, z18.h, z19.h\n"
       "uzp1 z16.b, z16.b, z17.b\n"
-      "smax z28.s, p2/M, z28.s, z5.s\n"
-      "smax z29.s, p2/M, z29.s, z5.s\n"
+      "smax z28.s, p2/M, z28.s, z0.s\n"
+      "smax z29.s, p2/M, z29.s, z0.s\n"
       "uzp1 z23.h, z23.h, z28.h\n"
-      "st1b { z16.b }, p1, [x22]\n"
-      "smax z30.s, p2/M, z30.s, z5.s\n"
-      "smax z24.s, p2/M, z24.s, z5.s\n"
-      "uzp1 z28.h, z29.h, z30.h\n"
-      "uzp1 z23.b, z23.b, z28.b\n"
-      "smax z25.s, p2/M, z25.s, z5.s\n"
-      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "st1b { z16.b }, p1, [x23]\n"
+      "smax z30.s, p2/M, z30.s, z0.s\n"
+      "smax z24.s, p2/M, z24.s, z0.s\n"
+      "uzp1 z16.h, z29.h, z30.h\n"
+      "uzp1 z23.b, z23.b, z16.b\n"
+      "smax z25.s, p2/M, z25.s, z0.s\n"
+      "smax z26.s, p2/M, z26.s, z0.s\n"
       "uzp1 z24.h, z24.h, z25.h\n"
-      "st1b { z23.b }, p1, [x21]\n"
-      "smax z27.s, p2/M, z27.s, z5.s\n"
-      "uzp1 z25.h, z26.h, z27.h\n"
-      "uzp1 z24.b, z24.b, z25.b\n"
-      "st1b { z24.b }, p1, [x20]\n"
+      "st1b { z23.b }, p1, [x22]\n"
+      "smax z27.s, p2/M, z27.s, z0.s\n"
+      "uzp1 z16.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z16.b\n"
+      "st1b { z24.b }, p1, [x21]\n"
       "addvl x27, x27, #1\n"
       "56:"  // Height 4: Writeback done
       "decw x9, ALL, MUL #4\n"
@@ -1407,7 +1407,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "58:"  // Exit
-
       : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1415,4 +1414,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
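Note on the hunks above: the register renumbering in this requantization epilogue does not change the arithmetic; the kernel still performs the usual Requantize32 output stage (row-sum and column-bias fixup, fixed-point multiply, rounding right shift, output offset, clamp, narrow to 8 bits). As a reading aid only, here is a minimal scalar sketch of that stage, assuming the per-tensor path; the field names mirror the Requantize32 operands referenced in the asm constraints, while the helper functions and their signatures are illustrative rather than taken from the library.

    #include <algorithm>
    #include <cstdint>

    // Saturating rounding doubling multiply-high, matching the SQRDMULH semantics
    // used with the per-layer multiplier.
    static inline int32_t sqrdmulh(int32_t a, int32_t b)
    {
        const int64_t p = (int64_t)a * (int64_t)b + (1LL << 30);
        const int64_t r = p >> 31;
        return (int32_t)std::clamp<int64_t>(r, INT32_MIN, INT32_MAX);
    }

    // Rounding right shift, matching the effect of SRSHL with a negative shift amount.
    static inline int32_t rounding_rshift(int32_t v, int right_shift)
    {
        if (right_shift <= 0) return v;
        return (int32_t)(((int64_t)v + (1LL << (right_shift - 1))) >> right_shift);
    }

    // One output element of the quantized epilogue (per-tensor case).
    static inline int8_t requantize_one(int32_t acc, int32_t row_sum_fixup,
                                        int32_t col_bias, int32_t per_layer_mul,
                                        int right_shift, int32_t c_offset,
                                        int32_t minval, int32_t maxval)
    {
        int32_t v = acc + row_sum_fixup + col_bias;   // row-sum * (-b_offset) fixup plus column bias
        v = sqrdmulh(v, per_layer_mul);               // fixed-point scale
        v = rounding_rshift(v, right_shift);          // per-layer right shift
        v += c_offset;                                // output zero point
        v = std::clamp(v, minval, maxval);            // saturate to the stored 8-bit range
        return (int8_t)v;
    }

The smin/smax pair in the assembly is exactly the clamp step here, and the uzp1 h/b sequence that follows is the narrowing store that requantize_one's final cast stands in for.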
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index 901cc6d..e9197e8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -39,6 +39,7 @@
 {
 // Actual kernel implementations
 void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+void sve_hybrid_u8u32_dot_6x4VL_a64fx( ARGLIST );
 
 class cls_sve_hybrid_u8u32_dot_6x4VL
 {
@@ -74,7 +75,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, uint32_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -83,10 +83,11 @@
                     return { 20.98 };
                 case CPUModel::V1:
                     return { 62.19 };
+                case CPUModel::A64FX:
+                    return { 91.23 };
             }
         }
 
-
         if (std::is_same<T, uint8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -95,6 +96,8 @@
                     return { 22.75, 3.90, 0.47 };
                 case CPUModel::V1:
                     return { 48.09, 16.24, 0.83 };
+                case CPUModel::A64FX:
+                    return { 101.62, 3.15, 0.42 };
             }
         }
 
@@ -103,13 +106,19 @@
 
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
-    cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
+    cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A64FX:
+                kernel=sve_hybrid_u8u32_dot_6x4VL_a64fx;
+                break;
+        }
     }
 };
 
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
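For context, the constructor change in this header follows the runtime-dispatch pattern used elsewhere in arm_gemm: the default kern_type pointer stays on the generic SVE implementation and is only replaced when the detected CPU model is A64FX. A minimal standalone sketch of that pattern is below; the names (Model, run_generic, run_a64fx, KernelSelector) are hypothetical stand-ins, not library symbols.

    #include <cstdio>

    enum class Model { GENERIC, A64FX };

    static void run_generic() { std::puts("generic SVE dot kernel"); }
    static void run_a64fx()   { std::puts("A64FX-tuned dot kernel"); }

    struct KernelSelector {
        using kern_type = void (*)();
        kern_type kernel = run_generic;   // default to the generic kernel

        explicit KernelSelector(Model m)
        {
            switch (m) {
                default:
                    break;
                case Model::A64FX:
                    kernel = run_a64fx;   // override only where a tuned variant exists
                    break;
            }
        }
    };

    int main()
    {
        KernelSelector sel(Model::A64FX);
        sel.kernel();   // invokes the A64FX variant selected at construction time
        return 0;
    }

Keeping the selection in the constructor (rather than in the caller) means existing call sites pick up the A64FX path automatically once they pass a CPUInfo, which is what the one-line signature change above enables.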
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
index a7dbef3..4d0f449 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
@@ -115,11 +115,11 @@
       "5:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 6f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 7f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -135,12 +135,12 @@
       "8:"  // Height 1: Multiply loop: Main loop
       "udot z8.s, z6.b, z0.b\n"
       "udot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x26, x26, #0x4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z11.s, z7.b, z0.b\n"
+      "udot z10.s, z17.b, z0.b\n"
+      "udot z11.s, z16.b, z0.b\n"
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
@@ -150,12 +150,12 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "udot z8.s, z6.b, z0.b\n"
       "udot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z11.s, z7.b, z0.b\n"
+      "udot z10.s, z17.b, z0.b\n"
+      "udot z11.s, z16.b, z0.b\n"
       "addvl x10, x10, #4\n"
       "bne 5b\n"
       "st1w { z8.s }, p3, [x9]\n"
@@ -183,15 +183,15 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 13f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x20]\n"
+      "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 14f\n"
       "13:"  // Height 2: no accumulate
       "mov z8.s, #0x0\n"
@@ -207,12 +207,12 @@
       "15:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 16f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 17f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -220,7 +220,7 @@
       "b 17f\n"
       "16:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "17:"  // Height 2: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -231,18 +231,18 @@
       "18:"  // Height 2: Multiply loop: Main loop
       "udot z8.s, z6.b, z0.b\n"
       "udot z12.s, z6.b, z1.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x26, x26, #0x4\n"
       "udot z9.s, z7.b, z0.b\n"
       "udot z13.s, z7.b, z1.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "subs x27, x27, #0x4\n"
       "add x25, x25, #0x4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z11.s, z7.b, z0.b\n"
-      "udot z15.s, z7.b, z1.b\n"
+      "udot z10.s, z17.b, z0.b\n"
+      "udot z14.s, z17.b, z1.b\n"
+      "udot z11.s, z16.b, z0.b\n"
+      "udot z15.s, z16.b, z1.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
@@ -252,29 +252,29 @@
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "udot z8.s, z6.b, z0.b\n"
       "udot z12.s, z6.b, z1.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
       "udot z9.s, z7.b, z0.b\n"
       "udot z13.s, z7.b, z1.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
+      "udot z10.s, z17.b, z0.b\n"
+      "udot z14.s, z17.b, z1.b\n"
       "addvl x10, x10, #4\n"
-      "udot z11.s, z7.b, z0.b\n"
-      "udot z15.s, z7.b, z1.b\n"
+      "udot z11.s, z16.b, z0.b\n"
+      "udot z15.s, z16.b, z1.b\n"
       "bne 15b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x20]\n"
+      "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
       "20:"  // Height 2: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -295,20 +295,20 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 23f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x23]\n"
-      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x21]\n"
+      "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x20]\n"
+      "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 24f\n"
       "23:"  // Height 3: no accumulate
       "mov z8.s, #0x0\n"
@@ -328,13 +328,13 @@
       "25:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 26f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 27f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -343,8 +343,8 @@
       "b 27f\n"
       "26:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "27:"  // Height 3: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -360,21 +360,21 @@
       "subs x27, x27, #0x4\n"
       "udot z16.s, z6.b, z2.b\n"
       "udot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x4\n"
       "udot z13.s, z7.b, z1.b\n"
       "udot z17.s, z7.b, z2.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
       "add x24, x24, #0x4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z18.s, z6.b, z2.b\n"
-      "udot z11.s, z7.b, z0.b\n"
+      "udot z10.s, z21.b, z0.b\n"
+      "udot z14.s, z21.b, z1.b\n"
+      "udot z18.s, z21.b, z2.b\n"
+      "udot z11.s, z20.b, z0.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
-      "udot z15.s, z7.b, z1.b\n"
-      "udot z19.s, z7.b, z2.b\n"
+      "udot z15.s, z20.b, z1.b\n"
+      "udot z19.s, z20.b, z2.b\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -386,35 +386,35 @@
       "add x28, x28, #0x1\n"
       "udot z16.s, z6.b, z2.b\n"
       "udot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "udot z13.s, z7.b, z1.b\n"
       "udot z17.s, z7.b, z2.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z18.s, z6.b, z2.b\n"
-      "udot z11.s, z7.b, z0.b\n"
-      "udot z15.s, z7.b, z1.b\n"
-      "udot z19.s, z7.b, z2.b\n"
+      "udot z10.s, z21.b, z0.b\n"
+      "udot z14.s, z21.b, z1.b\n"
+      "udot z18.s, z21.b, z2.b\n"
+      "udot z11.s, z20.b, z0.b\n"
+      "udot z15.s, z20.b, z1.b\n"
+      "udot z19.s, z20.b, z2.b\n"
       "bne 25b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z8.s }, p3, [x9]\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p3, [x23]\n"
-      "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x21]\n"
+      "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p3, [x20]\n"
+      "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
       "30:"  // Height 3: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -435,25 +435,25 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 33f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p3/Z, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x23]\n"
-      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x22]\n"
-      "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x22]\n"
+      "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x21]\n"
+      "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x20]\n"
+      "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 34f\n"
       "33:"  // Height 4: no accumulate
       "mov z8.s, #0x0\n"
@@ -477,14 +477,14 @@
       "35:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 36f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 37f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -494,9 +494,9 @@
       "b 37f\n"
       "36:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "37:"  // Height 4: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -513,7 +513,7 @@
       "subs x27, x27, #0x4\n"
       "udot z16.s, z6.b, z2.b\n"
       "udot z20.s, z6.b, z3.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x4\n"
       "udot z9.s, z7.b, z0.b\n"
       "udot z13.s, z7.b, z1.b\n"
@@ -521,19 +521,19 @@
       "add x23, x23, #0x4\n"
       "udot z17.s, z7.b, z2.b\n"
       "udot z21.s, z7.b, z3.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z18.s, z6.b, z2.b\n"
-      "udot z22.s, z6.b, z3.b\n"
+      "udot z10.s, z25.b, z0.b\n"
+      "udot z14.s, z25.b, z1.b\n"
+      "udot z18.s, z25.b, z2.b\n"
+      "udot z22.s, z25.b, z3.b\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
-      "udot z11.s, z7.b, z0.b\n"
-      "udot z15.s, z7.b, z1.b\n"
+      "udot z11.s, z24.b, z0.b\n"
+      "udot z15.s, z24.b, z1.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
-      "udot z19.s, z7.b, z2.b\n"
-      "udot z23.s, z7.b, z3.b\n"
+      "udot z19.s, z24.b, z2.b\n"
+      "udot z23.s, z24.b, z3.b\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -545,44 +545,44 @@
       "add x28, x28, #0x1\n"
       "udot z16.s, z6.b, z2.b\n"
       "udot z20.s, z6.b, z3.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
       "cmp x28, x20\n"
       "udot z9.s, z7.b, z0.b\n"
       "udot z13.s, z7.b, z1.b\n"
       "udot z17.s, z7.b, z2.b\n"
       "udot z21.s, z7.b, z3.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z18.s, z6.b, z2.b\n"
-      "udot z22.s, z6.b, z3.b\n"
-      "udot z11.s, z7.b, z0.b\n"
-      "udot z15.s, z7.b, z1.b\n"
-      "udot z19.s, z7.b, z2.b\n"
-      "udot z23.s, z7.b, z3.b\n"
+      "udot z10.s, z25.b, z0.b\n"
+      "udot z14.s, z25.b, z1.b\n"
+      "udot z18.s, z25.b, z2.b\n"
+      "udot z22.s, z25.b, z3.b\n"
+      "udot z11.s, z24.b, z0.b\n"
+      "udot z15.s, z24.b, z1.b\n"
+      "udot z19.s, z24.b, z2.b\n"
+      "udot z23.s, z24.b, z3.b\n"
       "bne 35b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "st1w { z8.s }, p3, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p3, [x23]\n"
-      "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p3, [x22]\n"
-      "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x22]\n"
+      "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
+      "st1w { z16.s }, p3, [x21]\n"
+      "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
+      "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
+      "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z20.s }, p3, [x20]\n"
+      "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
       "40:"  // Height 4: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -603,30 +603,30 @@
       "whilelt p0.s, x20, x11\n"
       "tbz %x[flags], #0, 43f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z8.s }, p3/Z, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p3/Z, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p3/Z, [x24]\n"
-      "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p3/Z, [x23]\n"
-      "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p3/Z, [x22]\n"
-      "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z24.s }, p3/Z, [x21]\n"
-      "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p3/Z, [x23]\n"
+      "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p3/Z, [x22]\n"
+      "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p3/Z, [x21]\n"
+      "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p3/Z, [x20]\n"
+      "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
       "b 44f\n"
       "43:"  // Height 5: no accumulate
       "mov z8.s, #0x0\n"
@@ -654,15 +654,15 @@
       "45:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 46f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -673,10 +673,10 @@
       "b 47f\n"
       "46:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "47:"  // Height 5: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -698,29 +698,29 @@
       "add x24, x24, #0x4\n"
       "udot z24.s, z6.b, z4.b\n"
       "udot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
       "add x23, x23, #0x4\n"
       "udot z13.s, z7.b, z1.b\n"
       "udot z17.s, z7.b, z2.b\n"
       "add x22, x22, #0x4\n"
       "udot z21.s, z7.b, z3.b\n"
       "udot z25.s, z7.b, z4.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z18.s, z6.b, z2.b\n"
-      "udot z22.s, z6.b, z3.b\n"
-      "udot z26.s, z6.b, z4.b\n"
-      "udot z11.s, z7.b, z0.b\n"
+      "udot z10.s, z29.b, z0.b\n"
+      "udot z14.s, z29.b, z1.b\n"
+      "udot z18.s, z29.b, z2.b\n"
+      "udot z22.s, z29.b, z3.b\n"
+      "udot z26.s, z29.b, z4.b\n"
+      "udot z11.s, z28.b, z0.b\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
       "ld1b { z6.b }, p4/Z, [x10]\n"
-      "udot z15.s, z7.b, z1.b\n"
-      "udot z19.s, z7.b, z2.b\n"
+      "udot z15.s, z28.b, z1.b\n"
+      "udot z19.s, z28.b, z2.b\n"
       "ld1rw { z1.s }, p4/Z, [x25]\n"
       "ld1rw { z2.s }, p4/Z, [x24]\n"
-      "udot z23.s, z7.b, z3.b\n"
-      "udot z27.s, z7.b, z4.b\n"
+      "udot z23.s, z28.b, z3.b\n"
+      "udot z27.s, z28.b, z4.b\n"
       "ld1rw { z3.s }, p4/Z, [x23]\n"
       "ld1rw { z4.s }, p4/Z, [x22]\n"
       "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
@@ -735,50 +735,50 @@
       "cmp x28, x20\n"
       "udot z24.s, z6.b, z4.b\n"
       "udot z9.s, z7.b, z0.b\n"
-      "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
       "udot z13.s, z7.b, z1.b\n"
       "udot z17.s, z7.b, z2.b\n"
       "udot z21.s, z7.b, z3.b\n"
       "udot z25.s, z7.b, z4.b\n"
-      "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b\n"
-      "udot z14.s, z6.b, z1.b\n"
-      "udot z18.s, z6.b, z2.b\n"
-      "udot z22.s, z6.b, z3.b\n"
-      "udot z26.s, z6.b, z4.b\n"
-      "udot z11.s, z7.b, z0.b\n"
-      "udot z15.s, z7.b, z1.b\n"
-      "udot z19.s, z7.b, z2.b\n"
-      "udot z23.s, z7.b, z3.b\n"
-      "udot z27.s, z7.b, z4.b\n"
+      "udot z10.s, z29.b, z0.b\n"
+      "udot z14.s, z29.b, z1.b\n"
+      "udot z18.s, z29.b, z2.b\n"
+      "udot z22.s, z29.b, z3.b\n"
+      "udot z26.s, z29.b, z4.b\n"
+      "udot z11.s, z28.b, z0.b\n"
+      "udot z15.s, z28.b, z1.b\n"
+      "udot z19.s, z28.b, z2.b\n"
+      "udot z23.s, z28.b, z3.b\n"
+      "udot z27.s, z28.b, z4.b\n"
       "bne 45b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "st1w { z8.s }, p3, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "st1w { z8.s }, p3, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p3, [x24]\n"
-      "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p3, [x23]\n"
-      "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p3, [x22]\n"
-      "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
-      "st1w { z24.s }, p3, [x21]\n"
-      "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
-      "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
-      "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z12.s }, p3, [x23]\n"
+      "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+      "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+      "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+      "st1w { z16.s }, p3, [x22]\n"
+      "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+      "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+      "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+      "st1w { z20.s }, p3, [x21]\n"
+      "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+      "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+      "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+      "st1w { z24.s }, p3, [x20]\n"
+      "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+      "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+      "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
       "50:"  // Height 5: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -862,16 +862,16 @@
       "55:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 56f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 57f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -883,11 +883,11 @@
       "b 57f\n"
       "56:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "57:"  // Height 6: input setup done
       "subs x27, x27, #0x4\n"
       "ld1rw { z0.s }, p4/Z, [x26]\n"
@@ -1022,7 +1022,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "62:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1030,4 +1029,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index 30a108a..7871c0b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -115,11 +115,11 @@
       "5:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 6f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 7f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -132,87 +132,87 @@
       "8:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10]\n"
+      "udot z8.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z9.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z10.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z11.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "udot z8.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "udot z9.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "udot z10.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "udot z11.s, z16.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[2]\n"
+      "udot z9.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "udot z10.s, z17.b, z0.b[2]\n"
+      "udot z11.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[3]\n"
+      "udot z9.s, z16.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z10.s, z17.b, z0.b[3]\n"
+      "udot z11.s, z16.b, z0.b[3]\n"
       "add x26, x26, #0x10\n"
       "bgt 8b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10]\n"
+      "udot z8.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z9.s, z16.b, z0.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z17.b, z0.b[0]\n"
+      "udot z11.s, z16.b, z0.b[0]\n"
       "addvl x10, x10, #4\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[1]\n"
+      "udot z9.s, z16.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z10.s, z17.b, z0.b[1]\n"
+      "udot z11.s, z16.b, z0.b[1]\n"
       "addvl x10, x10, #4\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[2]\n"
+      "udot z9.s, z16.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z10.s, z17.b, z0.b[2]\n"
+      "udot z11.s, z16.b, z0.b[2]\n"
       "addvl x10, x10, #4\n"
       "ble 10f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[3]\n"
+      "udot z9.s, z16.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z17.b, z0.b[3]\n"
+      "udot z11.s, z16.b, z0.b[3]\n"
       "addvl x10, x10, #4\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -244,15 +244,15 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 14f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 15f\n"
       "14:"  // Height 2: no accumulate
       "mov z8.s, #0x0\n"
@@ -268,12 +268,12 @@
       "16:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -281,146 +281,146 @@
       "b 18f\n"
       "17:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "18:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "ble 20f\n"
       "19:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z1.b }, p0/Z, [x26]\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z1.b[0]\n"
+      "udot z12.s, z17.b, z0.b[0]\n"
+      "udot z9.s, z16.b, z1.b[0]\n"
+      "udot z13.s, z16.b, z0.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z17.b, z1.b[0]\n"
+      "udot z14.s, z17.b, z0.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
       "cmp x27, #0x10\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "udot z11.s, z16.b, z1.b[0]\n"
+      "udot z15.s, z16.b, z0.b[0]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "udot z8.s, z17.b, z1.b[1]\n"
+      "udot z12.s, z17.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "udot z9.s, z16.b, z1.b[1]\n"
+      "udot z13.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z10.s, z17.b, z1.b[1]\n"
+      "udot z14.s, z17.b, z0.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "udot z11.s, z16.b, z1.b[1]\n"
+      "udot z15.s, z16.b, z0.b[1]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "udot z8.s, z17.b, z1.b[2]\n"
+      "udot z12.s, z17.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "udot z9.s, z16.b, z1.b[2]\n"
+      "udot z13.s, z16.b, z0.b[2]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "udot z10.s, z17.b, z1.b[2]\n"
+      "udot z14.s, z17.b, z0.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "udot z11.s, z16.b, z1.b[2]\n"
+      "udot z15.s, z16.b, z0.b[2]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "udot z8.s, z17.b, z1.b[3]\n"
+      "udot z12.s, z17.b, z0.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "udot z9.s, z16.b, z1.b[3]\n"
+      "udot z13.s, z16.b, z0.b[3]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "udot z10.s, z17.b, z1.b[3]\n"
+      "udot z14.s, z17.b, z0.b[3]\n"
+      "udot z11.s, z16.b, z1.b[3]\n"
+      "udot z15.s, z16.b, z0.b[3]\n"
       "bgt 19b\n"
       "20:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z0.b }, p0/Z, [x26]\n"
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "subs x27, x27, #0x4\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[0]\n"
+      "udot z12.s, z17.b, z1.b[0]\n"
+      "udot z9.s, z16.b, z0.b[0]\n"
+      "udot z13.s, z16.b, z1.b[0]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z17.b, z0.b[0]\n"
+      "udot z14.s, z17.b, z1.b[0]\n"
       "addvl x10, x10, #4\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z11.s, z16.b, z0.b[0]\n"
+      "udot z15.s, z16.b, z1.b[0]\n"
       "ble 21f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[1]\n"
+      "udot z12.s, z17.b, z1.b[1]\n"
+      "udot z9.s, z16.b, z0.b[1]\n"
+      "udot z13.s, z16.b, z1.b[1]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z10.s, z17.b, z0.b[1]\n"
+      "udot z14.s, z17.b, z1.b[1]\n"
       "addvl x10, x10, #4\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z11.s, z16.b, z0.b[1]\n"
+      "udot z15.s, z16.b, z1.b[1]\n"
       "ble 21f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[2]\n"
+      "udot z12.s, z17.b, z1.b[2]\n"
+      "udot z9.s, z16.b, z0.b[2]\n"
+      "udot z13.s, z16.b, z1.b[2]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z10.s, z17.b, z0.b[2]\n"
+      "udot z14.s, z17.b, z1.b[2]\n"
       "addvl x10, x10, #4\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z11.s, z16.b, z0.b[2]\n"
+      "udot z15.s, z16.b, z1.b[2]\n"
       "ble 21f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z17.b, z0.b[3]\n"
+      "udot z12.s, z17.b, z1.b[3]\n"
+      "udot z9.s, z16.b, z0.b[3]\n"
+      "udot z13.s, z16.b, z1.b[3]\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z17.b, z0.b[3]\n"
+      "udot z14.s, z17.b, z1.b[3]\n"
       "addvl x10, x10, #4\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z11.s, z16.b, z0.b[3]\n"
+      "udot z15.s, z16.b, z1.b[3]\n"
       "21:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 16b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
+      "add x20, x9, x20, LSL #2\n"
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x20]\n"
+      "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
       "22:"  // Height 2: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -441,20 +441,20 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 25f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23]\n"
-      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x20]\n"
+      "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 26f\n"
       "25:"  // Height 3: no accumulate
       "mov z8.s, #0x0\n"
@@ -474,13 +474,13 @@
       "27:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 28f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 29f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -489,86 +489,86 @@
       "b 29f\n"
       "28:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "29:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "ble 31f\n"
       "30:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x24]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "udot z8.s, z21.b, z2.b[0]\n"
+      "udot z12.s, z21.b, z1.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z16.s, z21.b, z0.b[0]\n"
+      "udot z9.s, z20.b, z2.b[0]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[0]\n"
+      "udot z17.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "cmp x27, #0x10\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z10.s, z21.b, z2.b[0]\n"
+      "udot z14.s, z21.b, z1.b[0]\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "udot z18.s, z21.b, z0.b[0]\n"
+      "udot z11.s, z20.b, z2.b[0]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "udot z15.s, z20.b, z1.b[0]\n"
+      "udot z19.s, z20.b, z0.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "udot z8.s, z21.b, z2.b[1]\n"
+      "udot z12.s, z21.b, z1.b[1]\n"
+      "udot z16.s, z21.b, z0.b[1]\n"
+      "udot z9.s, z20.b, z2.b[1]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[1]\n"
+      "udot z17.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z10.s, z21.b, z2.b[1]\n"
+      "udot z14.s, z21.b, z1.b[1]\n"
+      "udot z18.s, z21.b, z0.b[1]\n"
+      "udot z11.s, z20.b, z2.b[1]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "udot z15.s, z20.b, z1.b[1]\n"
+      "udot z19.s, z20.b, z0.b[1]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "udot z8.s, z21.b, z2.b[2]\n"
+      "udot z12.s, z21.b, z1.b[2]\n"
+      "udot z16.s, z21.b, z0.b[2]\n"
+      "udot z9.s, z20.b, z2.b[2]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[2]\n"
+      "udot z17.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "udot z10.s, z21.b, z2.b[2]\n"
+      "udot z14.s, z21.b, z1.b[2]\n"
+      "udot z18.s, z21.b, z0.b[2]\n"
+      "udot z11.s, z20.b, z2.b[2]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "udot z15.s, z20.b, z1.b[2]\n"
+      "udot z19.s, z20.b, z0.b[2]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "udot z8.s, z21.b, z2.b[3]\n"
+      "udot z12.s, z21.b, z1.b[3]\n"
+      "udot z16.s, z21.b, z0.b[3]\n"
+      "udot z9.s, z20.b, z2.b[3]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[3]\n"
+      "udot z17.s, z20.b, z0.b[3]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "udot z10.s, z21.b, z2.b[3]\n"
+      "udot z14.s, z21.b, z1.b[3]\n"
+      "udot z18.s, z21.b, z0.b[3]\n"
+      "udot z11.s, z20.b, z2.b[3]\n"
+      "udot z15.s, z20.b, z1.b[3]\n"
+      "udot z19.s, z20.b, z0.b[3]\n"
       "bgt 30b\n"
       "31:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -576,100 +576,100 @@
       "ld1rqb { z1.b }, p0/Z, [x25]\n"
       "subs x27, x27, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "udot z8.s, z21.b, z0.b[0]\n"
+      "udot z12.s, z21.b, z1.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z16.s, z21.b, z2.b[0]\n"
+      "udot z9.s, z20.b, z0.b[0]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[0]\n"
+      "udot z17.s, z20.b, z2.b[0]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z10.s, z21.b, z0.b[0]\n"
+      "udot z14.s, z21.b, z1.b[0]\n"
+      "udot z18.s, z21.b, z2.b[0]\n"
+      "udot z11.s, z20.b, z0.b[0]\n"
+      "udot z15.s, z20.b, z1.b[0]\n"
+      "udot z19.s, z20.b, z2.b[0]\n"
       "ble 32f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z21.b, z0.b[1]\n"
+      "udot z12.s, z21.b, z1.b[1]\n"
+      "udot z16.s, z21.b, z2.b[1]\n"
+      "udot z9.s, z20.b, z0.b[1]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[1]\n"
+      "udot z17.s, z20.b, z2.b[1]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z10.s, z21.b, z0.b[1]\n"
+      "udot z14.s, z21.b, z1.b[1]\n"
+      "udot z18.s, z21.b, z2.b[1]\n"
+      "udot z11.s, z20.b, z0.b[1]\n"
+      "udot z15.s, z20.b, z1.b[1]\n"
+      "udot z19.s, z20.b, z2.b[1]\n"
       "ble 32f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z21.b, z0.b[2]\n"
+      "udot z12.s, z21.b, z1.b[2]\n"
+      "udot z16.s, z21.b, z2.b[2]\n"
+      "udot z9.s, z20.b, z0.b[2]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[2]\n"
+      "udot z17.s, z20.b, z2.b[2]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z10.s, z21.b, z0.b[2]\n"
+      "udot z14.s, z21.b, z1.b[2]\n"
+      "udot z18.s, z21.b, z2.b[2]\n"
+      "udot z11.s, z20.b, z0.b[2]\n"
+      "udot z15.s, z20.b, z1.b[2]\n"
+      "udot z19.s, z20.b, z2.b[2]\n"
       "ble 32f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z21.b }, p5/Z, [x10]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z21.b, z0.b[3]\n"
+      "udot z12.s, z21.b, z1.b[3]\n"
+      "udot z16.s, z21.b, z2.b[3]\n"
+      "udot z9.s, z20.b, z0.b[3]\n"
+      "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z20.b, z1.b[3]\n"
+      "udot z17.s, z20.b, z2.b[3]\n"
+      "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z10.s, z21.b, z0.b[3]\n"
+      "udot z14.s, z21.b, z1.b[3]\n"
+      "udot z18.s, z21.b, z2.b[3]\n"
+      "udot z11.s, z20.b, z0.b[3]\n"
+      "udot z15.s, z20.b, z1.b[3]\n"
+      "udot z19.s, z20.b, z2.b[3]\n"
       "32:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 27b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z8.s }, p4, [x9]\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x21]\n"
+      "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x20]\n"
+      "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
       "33:"  // Height 3: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -690,25 +690,25 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "ld1w { z8.s }, p4/Z, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23]\n"
-      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x21]\n"
+      "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 37f\n"
       "36:"  // Height 4: no accumulate
       "mov z8.s, #0x0\n"
@@ -732,14 +732,14 @@
       "38:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 39f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -749,105 +749,105 @@
       "b 40f\n"
       "39:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "40:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "ble 42f\n"
       "41:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z3.b }, p0/Z, [x26]\n"
+      "ld1rqb { z2.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "ld1rqb { z0.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z20.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z25.b, z3.b[0]\n"
+      "udot z12.s, z25.b, z2.b[0]\n"
+      "udot z16.s, z25.b, z1.b[0]\n"
+      "udot z20.s, z25.b, z0.b[0]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
+      "udot z9.s, z24.b, z3.b[0]\n"
+      "udot z13.s, z24.b, z2.b[0]\n"
       "add x24, x24, #0x10\n"
       "add x23, x23, #0x10\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "udot z21.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "udot z23.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z20.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "udot z21.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "udot z17.s, z24.b, z1.b[0]\n"
+      "udot z21.s, z24.b, z0.b[0]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z25.b, z3.b[0]\n"
+      "udot z14.s, z25.b, z2.b[0]\n"
+      "udot z18.s, z25.b, z1.b[0]\n"
+      "udot z22.s, z25.b, z0.b[0]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "udot z11.s, z24.b, z3.b[0]\n"
+      "udot z15.s, z24.b, z2.b[0]\n"
+      "udot z19.s, z24.b, z1.b[0]\n"
+      "udot z23.s, z24.b, z0.b[0]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "udot z8.s, z25.b, z3.b[1]\n"
+      "udot z12.s, z25.b, z2.b[1]\n"
+      "udot z16.s, z25.b, z1.b[1]\n"
+      "udot z20.s, z25.b, z0.b[1]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "udot z9.s, z24.b, z3.b[1]\n"
+      "udot z13.s, z24.b, z2.b[1]\n"
+      "udot z17.s, z24.b, z1.b[1]\n"
+      "udot z21.s, z24.b, z0.b[1]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z22.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "udot z23.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z20.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "udot z21.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z22.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "udot z23.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z20.s, z6.b, z3.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "udot z21.s, z7.b, z3.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
-      "udot z23.s, z7.b, z3.b[3]\n"
+      "udot z10.s, z25.b, z3.b[1]\n"
+      "udot z14.s, z25.b, z2.b[1]\n"
+      "udot z18.s, z25.b, z1.b[1]\n"
+      "udot z22.s, z25.b, z0.b[1]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "udot z11.s, z24.b, z3.b[1]\n"
+      "udot z15.s, z24.b, z2.b[1]\n"
+      "udot z19.s, z24.b, z1.b[1]\n"
+      "udot z23.s, z24.b, z0.b[1]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "udot z8.s, z25.b, z3.b[2]\n"
+      "udot z12.s, z25.b, z2.b[2]\n"
+      "udot z16.s, z25.b, z1.b[2]\n"
+      "udot z20.s, z25.b, z0.b[2]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "udot z9.s, z24.b, z3.b[2]\n"
+      "udot z13.s, z24.b, z2.b[2]\n"
+      "udot z17.s, z24.b, z1.b[2]\n"
+      "udot z21.s, z24.b, z0.b[2]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "udot z10.s, z25.b, z3.b[2]\n"
+      "udot z14.s, z25.b, z2.b[2]\n"
+      "udot z18.s, z25.b, z1.b[2]\n"
+      "udot z22.s, z25.b, z0.b[2]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "udot z11.s, z24.b, z3.b[2]\n"
+      "udot z15.s, z24.b, z2.b[2]\n"
+      "udot z19.s, z24.b, z1.b[2]\n"
+      "udot z23.s, z24.b, z0.b[2]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "udot z8.s, z25.b, z3.b[3]\n"
+      "udot z12.s, z25.b, z2.b[3]\n"
+      "udot z16.s, z25.b, z1.b[3]\n"
+      "udot z20.s, z25.b, z0.b[3]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "udot z9.s, z24.b, z3.b[3]\n"
+      "udot z13.s, z24.b, z2.b[3]\n"
+      "udot z17.s, z24.b, z1.b[3]\n"
+      "udot z21.s, z24.b, z0.b[3]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "udot z10.s, z25.b, z3.b[3]\n"
+      "udot z14.s, z25.b, z2.b[3]\n"
+      "udot z18.s, z25.b, z1.b[3]\n"
+      "udot z22.s, z25.b, z0.b[3]\n"
+      "udot z11.s, z24.b, z3.b[3]\n"
+      "udot z15.s, z24.b, z2.b[3]\n"
+      "udot z19.s, z24.b, z1.b[3]\n"
+      "udot z23.s, z24.b, z0.b[3]\n"
       "bgt 41b\n"
       "42:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -856,121 +856,121 @@
       "subs x27, x27, #0x4\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z20.s, z6.b, z3.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "udot z21.s, z7.b, z3.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z25.b, z0.b[0]\n"
+      "udot z12.s, z25.b, z1.b[0]\n"
+      "udot z16.s, z25.b, z2.b[0]\n"
+      "udot z20.s, z25.b, z3.b[0]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z9.s, z24.b, z0.b[0]\n"
+      "udot z13.s, z24.b, z1.b[0]\n"
+      "udot z17.s, z24.b, z2.b[0]\n"
+      "udot z21.s, z24.b, z3.b[0]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "udot z23.s, z7.b, z3.b[0]\n"
+      "udot z10.s, z25.b, z0.b[0]\n"
+      "udot z14.s, z25.b, z1.b[0]\n"
+      "udot z18.s, z25.b, z2.b[0]\n"
+      "udot z22.s, z25.b, z3.b[0]\n"
+      "udot z11.s, z24.b, z0.b[0]\n"
+      "udot z15.s, z24.b, z1.b[0]\n"
+      "udot z19.s, z24.b, z2.b[0]\n"
+      "udot z23.s, z24.b, z3.b[0]\n"
       "ble 43f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z20.s, z6.b, z3.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z25.b, z0.b[1]\n"
+      "udot z12.s, z25.b, z1.b[1]\n"
+      "udot z16.s, z25.b, z2.b[1]\n"
+      "udot z20.s, z25.b, z3.b[1]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "udot z21.s, z7.b, z3.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z9.s, z24.b, z0.b[1]\n"
+      "udot z13.s, z24.b, z1.b[1]\n"
+      "udot z17.s, z24.b, z2.b[1]\n"
+      "udot z21.s, z24.b, z3.b[1]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z22.s, z6.b, z3.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "udot z23.s, z7.b, z3.b[1]\n"
+      "udot z10.s, z25.b, z0.b[1]\n"
+      "udot z14.s, z25.b, z1.b[1]\n"
+      "udot z18.s, z25.b, z2.b[1]\n"
+      "udot z22.s, z25.b, z3.b[1]\n"
+      "udot z11.s, z24.b, z0.b[1]\n"
+      "udot z15.s, z24.b, z1.b[1]\n"
+      "udot z19.s, z24.b, z2.b[1]\n"
+      "udot z23.s, z24.b, z3.b[1]\n"
       "ble 43f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z20.s, z6.b, z3.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z25.b, z0.b[2]\n"
+      "udot z12.s, z25.b, z1.b[2]\n"
+      "udot z16.s, z25.b, z2.b[2]\n"
+      "udot z20.s, z25.b, z3.b[2]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x4\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "udot z21.s, z7.b, z3.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z9.s, z24.b, z0.b[2]\n"
+      "udot z13.s, z24.b, z1.b[2]\n"
+      "udot z17.s, z24.b, z2.b[2]\n"
+      "udot z21.s, z24.b, z3.b[2]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z22.s, z6.b, z3.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "udot z23.s, z7.b, z3.b[2]\n"
+      "udot z10.s, z25.b, z0.b[2]\n"
+      "udot z14.s, z25.b, z1.b[2]\n"
+      "udot z18.s, z25.b, z2.b[2]\n"
+      "udot z22.s, z25.b, z3.b[2]\n"
+      "udot z11.s, z24.b, z0.b[2]\n"
+      "udot z15.s, z24.b, z1.b[2]\n"
+      "udot z19.s, z24.b, z2.b[2]\n"
+      "udot z23.s, z24.b, z3.b[2]\n"
       "ble 43f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z20.s, z6.b, z3.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "udot z21.s, z7.b, z3.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z25.b, z0.b[3]\n"
+      "udot z12.s, z25.b, z1.b[3]\n"
+      "udot z16.s, z25.b, z2.b[3]\n"
+      "udot z20.s, z25.b, z3.b[3]\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z9.s, z24.b, z0.b[3]\n"
+      "udot z13.s, z24.b, z1.b[3]\n"
+      "udot z17.s, z24.b, z2.b[3]\n"
+      "udot z21.s, z24.b, z3.b[3]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
-      "udot z23.s, z7.b, z3.b[3]\n"
+      "udot z10.s, z25.b, z0.b[3]\n"
+      "udot z14.s, z25.b, z1.b[3]\n"
+      "udot z18.s, z25.b, z2.b[3]\n"
+      "udot z22.s, z25.b, z3.b[3]\n"
+      "udot z11.s, z24.b, z0.b[3]\n"
+      "udot z15.s, z24.b, z1.b[3]\n"
+      "udot z19.s, z24.b, z2.b[3]\n"
+      "udot z23.s, z24.b, z3.b[3]\n"
       "43:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 38b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
       "st1w { z8.s }, p4, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p4, [x22]\n"
-      "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x22]\n"
+      "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x21]\n"
+      "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z20.s }, p4, [x20]\n"
+      "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
       "44:"  // Height 4: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -991,30 +991,30 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z8.s }, p4/Z, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z8.s }, p4/Z, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
       "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z16.s }, p4/Z, [x23]\n"
-      "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z24.s }, p4/Z, [x21]\n"
-      "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
-      "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x22]\n"
+      "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x20]\n"
+      "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
       "b 48f\n"
       "47:"  // Height 5: no accumulate
       "mov z8.s, #0x0\n"
@@ -1042,15 +1042,15 @@
       "49:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1061,124 +1061,124 @@
       "b 51f\n"
       "50:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "51:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "ble 53f\n"
       "52:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z4.b }, p0/Z, [x26]\n"
+      "ld1rqb { z3.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z20.s, z6.b, z3.b[0]\n"
+      "ld1rqb { z0.b }, p0/Z, [x22]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "udot z8.s, z29.b, z4.b[0]\n"
+      "udot z12.s, z29.b, z3.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z16.s, z29.b, z2.b[0]\n"
+      "udot z20.s, z29.b, z1.b[0]\n"
       "add x25, x25, #0x10\n"
-      "udot z24.s, z6.b, z4.b[0]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z24.s, z29.b, z0.b[0]\n"
+      "udot z9.s, z28.b, z4.b[0]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
+      "udot z13.s, z28.b, z3.b[0]\n"
+      "udot z17.s, z28.b, z2.b[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "udot z21.s, z7.b, z3.b[0]\n"
-      "udot z25.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "udot z26.s, z6.b, z4.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "udot z23.s, z7.b, z3.b[0]\n"
-      "udot z27.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z20.s, z6.b, z3.b[1]\n"
-      "udot z24.s, z6.b, z4.b[1]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "udot z21.s, z7.b, z3.b[1]\n"
-      "udot z25.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "udot z21.s, z28.b, z1.b[0]\n"
+      "udot z25.s, z28.b, z0.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z29.b, z4.b[0]\n"
+      "udot z14.s, z29.b, z3.b[0]\n"
+      "udot z18.s, z29.b, z2.b[0]\n"
+      "udot z22.s, z29.b, z1.b[0]\n"
+      "udot z26.s, z29.b, z0.b[0]\n"
+      "udot z11.s, z28.b, z4.b[0]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "udot z15.s, z28.b, z3.b[0]\n"
+      "udot z19.s, z28.b, z2.b[0]\n"
+      "udot z23.s, z28.b, z1.b[0]\n"
+      "udot z27.s, z28.b, z0.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "udot z8.s, z29.b, z4.b[1]\n"
+      "udot z12.s, z29.b, z3.b[1]\n"
+      "udot z16.s, z29.b, z2.b[1]\n"
+      "udot z20.s, z29.b, z1.b[1]\n"
+      "udot z24.s, z29.b, z0.b[1]\n"
+      "udot z9.s, z28.b, z4.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "udot z13.s, z28.b, z3.b[1]\n"
+      "udot z17.s, z28.b, z2.b[1]\n"
+      "udot z21.s, z28.b, z1.b[1]\n"
+      "udot z25.s, z28.b, z0.b[1]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z22.s, z6.b, z3.b[1]\n"
-      "udot z26.s, z6.b, z4.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "udot z23.s, z7.b, z3.b[1]\n"
-      "udot z27.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z20.s, z6.b, z3.b[2]\n"
-      "udot z24.s, z6.b, z4.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "udot z21.s, z7.b, z3.b[2]\n"
-      "udot z25.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z22.s, z6.b, z3.b[2]\n"
-      "udot z26.s, z6.b, z4.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "udot z23.s, z7.b, z3.b[2]\n"
-      "udot z27.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z20.s, z6.b, z3.b[3]\n"
-      "udot z24.s, z6.b, z4.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "udot z21.s, z7.b, z3.b[3]\n"
-      "udot z25.s, z7.b, z4.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[3]\n"
-      "udot z26.s, z6.b, z4.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
-      "udot z23.s, z7.b, z3.b[3]\n"
-      "udot z27.s, z7.b, z4.b[3]\n"
+      "udot z10.s, z29.b, z4.b[1]\n"
+      "udot z14.s, z29.b, z3.b[1]\n"
+      "udot z18.s, z29.b, z2.b[1]\n"
+      "udot z22.s, z29.b, z1.b[1]\n"
+      "udot z26.s, z29.b, z0.b[1]\n"
+      "udot z11.s, z28.b, z4.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "udot z15.s, z28.b, z3.b[1]\n"
+      "udot z19.s, z28.b, z2.b[1]\n"
+      "udot z23.s, z28.b, z1.b[1]\n"
+      "udot z27.s, z28.b, z0.b[1]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "udot z8.s, z29.b, z4.b[2]\n"
+      "udot z12.s, z29.b, z3.b[2]\n"
+      "udot z16.s, z29.b, z2.b[2]\n"
+      "udot z20.s, z29.b, z1.b[2]\n"
+      "udot z24.s, z29.b, z0.b[2]\n"
+      "udot z9.s, z28.b, z4.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "udot z13.s, z28.b, z3.b[2]\n"
+      "udot z17.s, z28.b, z2.b[2]\n"
+      "udot z21.s, z28.b, z1.b[2]\n"
+      "udot z25.s, z28.b, z0.b[2]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "udot z10.s, z29.b, z4.b[2]\n"
+      "udot z14.s, z29.b, z3.b[2]\n"
+      "udot z18.s, z29.b, z2.b[2]\n"
+      "udot z22.s, z29.b, z1.b[2]\n"
+      "udot z26.s, z29.b, z0.b[2]\n"
+      "udot z11.s, z28.b, z4.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "udot z15.s, z28.b, z3.b[2]\n"
+      "udot z19.s, z28.b, z2.b[2]\n"
+      "udot z23.s, z28.b, z1.b[2]\n"
+      "udot z27.s, z28.b, z0.b[2]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "udot z8.s, z29.b, z4.b[3]\n"
+      "udot z12.s, z29.b, z3.b[3]\n"
+      "udot z16.s, z29.b, z2.b[3]\n"
+      "udot z20.s, z29.b, z1.b[3]\n"
+      "udot z24.s, z29.b, z0.b[3]\n"
+      "udot z9.s, z28.b, z4.b[3]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "udot z13.s, z28.b, z3.b[3]\n"
+      "udot z17.s, z28.b, z2.b[3]\n"
+      "udot z21.s, z28.b, z1.b[3]\n"
+      "udot z25.s, z28.b, z0.b[3]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "udot z10.s, z29.b, z4.b[3]\n"
+      "udot z14.s, z29.b, z3.b[3]\n"
+      "udot z18.s, z29.b, z2.b[3]\n"
+      "udot z22.s, z29.b, z1.b[3]\n"
+      "udot z26.s, z29.b, z0.b[3]\n"
+      "udot z11.s, z28.b, z4.b[3]\n"
+      "udot z15.s, z28.b, z3.b[3]\n"
+      "udot z19.s, z28.b, z2.b[3]\n"
+      "udot z23.s, z28.b, z1.b[3]\n"
+      "udot z27.s, z28.b, z0.b[3]\n"
       "bgt 52b\n"
       "53:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -1188,142 +1188,142 @@
       "ld1rqb { z2.b }, p0/Z, [x24]\n"
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
       "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z20.s, z6.b, z3.b[0]\n"
-      "udot z24.s, z6.b, z4.b[0]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "udot z21.s, z7.b, z3.b[0]\n"
-      "udot z25.s, z7.b, z4.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "udot z8.s, z29.b, z0.b[0]\n"
+      "udot z12.s, z29.b, z1.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z16.s, z29.b, z2.b[0]\n"
+      "udot z20.s, z29.b, z3.b[0]\n"
+      "udot z24.s, z29.b, z4.b[0]\n"
+      "udot z9.s, z28.b, z0.b[0]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z28.b, z1.b[0]\n"
+      "udot z17.s, z28.b, z2.b[0]\n"
+      "udot z21.s, z28.b, z3.b[0]\n"
+      "udot z25.s, z28.b, z4.b[0]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "udot z26.s, z6.b, z4.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "udot z23.s, z7.b, z3.b[0]\n"
-      "udot z27.s, z7.b, z4.b[0]\n"
+      "udot z10.s, z29.b, z0.b[0]\n"
+      "udot z14.s, z29.b, z1.b[0]\n"
+      "udot z18.s, z29.b, z2.b[0]\n"
+      "udot z22.s, z29.b, z3.b[0]\n"
+      "udot z26.s, z29.b, z4.b[0]\n"
+      "udot z11.s, z28.b, z0.b[0]\n"
+      "udot z15.s, z28.b, z1.b[0]\n"
+      "udot z19.s, z28.b, z2.b[0]\n"
+      "udot z23.s, z28.b, z3.b[0]\n"
+      "udot z27.s, z28.b, z4.b[0]\n"
       "ble 54f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z29.b, z0.b[1]\n"
+      "udot z12.s, z29.b, z1.b[1]\n"
+      "udot z16.s, z29.b, z2.b[1]\n"
+      "udot z20.s, z29.b, z3.b[1]\n"
       "subs x27, x27, #0x4\n"
-      "udot z24.s, z6.b, z4.b[1]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "udot z21.s, z7.b, z3.b[1]\n"
-      "udot z25.s, z7.b, z4.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z24.s, z29.b, z4.b[1]\n"
+      "udot z9.s, z28.b, z0.b[1]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z28.b, z1.b[1]\n"
+      "udot z17.s, z28.b, z2.b[1]\n"
+      "udot z21.s, z28.b, z3.b[1]\n"
+      "udot z25.s, z28.b, z4.b[1]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z22.s, z6.b, z3.b[1]\n"
-      "udot z26.s, z6.b, z4.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "udot z23.s, z7.b, z3.b[1]\n"
-      "udot z27.s, z7.b, z4.b[1]\n"
+      "udot z10.s, z29.b, z0.b[1]\n"
+      "udot z14.s, z29.b, z1.b[1]\n"
+      "udot z18.s, z29.b, z2.b[1]\n"
+      "udot z22.s, z29.b, z3.b[1]\n"
+      "udot z26.s, z29.b, z4.b[1]\n"
+      "udot z11.s, z28.b, z0.b[1]\n"
+      "udot z15.s, z28.b, z1.b[1]\n"
+      "udot z19.s, z28.b, z2.b[1]\n"
+      "udot z23.s, z28.b, z3.b[1]\n"
+      "udot z27.s, z28.b, z4.b[1]\n"
       "ble 54f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z29.b, z0.b[2]\n"
+      "udot z12.s, z29.b, z1.b[2]\n"
+      "udot z16.s, z29.b, z2.b[2]\n"
+      "udot z20.s, z29.b, z3.b[2]\n"
       "subs x27, x27, #0x4\n"
-      "udot z24.s, z6.b, z4.b[2]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "udot z21.s, z7.b, z3.b[2]\n"
-      "udot z25.s, z7.b, z4.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z24.s, z29.b, z4.b[2]\n"
+      "udot z9.s, z28.b, z0.b[2]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z28.b, z1.b[2]\n"
+      "udot z17.s, z28.b, z2.b[2]\n"
+      "udot z21.s, z28.b, z3.b[2]\n"
+      "udot z25.s, z28.b, z4.b[2]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z22.s, z6.b, z3.b[2]\n"
-      "udot z26.s, z6.b, z4.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "udot z23.s, z7.b, z3.b[2]\n"
-      "udot z27.s, z7.b, z4.b[2]\n"
+      "udot z10.s, z29.b, z0.b[2]\n"
+      "udot z14.s, z29.b, z1.b[2]\n"
+      "udot z18.s, z29.b, z2.b[2]\n"
+      "udot z22.s, z29.b, z3.b[2]\n"
+      "udot z26.s, z29.b, z4.b[2]\n"
+      "udot z11.s, z28.b, z0.b[2]\n"
+      "udot z15.s, z28.b, z1.b[2]\n"
+      "udot z19.s, z28.b, z2.b[2]\n"
+      "udot z23.s, z28.b, z3.b[2]\n"
+      "udot z27.s, z28.b, z4.b[2]\n"
       "ble 54f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z20.s, z6.b, z3.b[3]\n"
-      "udot z24.s, z6.b, z4.b[3]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "udot z21.s, z7.b, z3.b[3]\n"
-      "udot z25.s, z7.b, z4.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z29.b }, p5/Z, [x10]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z29.b, z0.b[3]\n"
+      "udot z12.s, z29.b, z1.b[3]\n"
+      "udot z16.s, z29.b, z2.b[3]\n"
+      "udot z20.s, z29.b, z3.b[3]\n"
+      "udot z24.s, z29.b, z4.b[3]\n"
+      "udot z9.s, z28.b, z0.b[3]\n"
+      "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z13.s, z28.b, z1.b[3]\n"
+      "udot z17.s, z28.b, z2.b[3]\n"
+      "udot z21.s, z28.b, z3.b[3]\n"
+      "udot z25.s, z28.b, z4.b[3]\n"
+      "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[3]\n"
-      "udot z26.s, z6.b, z4.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
-      "udot z23.s, z7.b, z3.b[3]\n"
-      "udot z27.s, z7.b, z4.b[3]\n"
+      "udot z10.s, z29.b, z0.b[3]\n"
+      "udot z14.s, z29.b, z1.b[3]\n"
+      "udot z18.s, z29.b, z2.b[3]\n"
+      "udot z22.s, z29.b, z3.b[3]\n"
+      "udot z26.s, z29.b, z4.b[3]\n"
+      "udot z11.s, z28.b, z0.b[3]\n"
+      "udot z15.s, z28.b, z1.b[3]\n"
+      "udot z19.s, z28.b, z2.b[3]\n"
+      "udot z23.s, z28.b, z3.b[3]\n"
+      "udot z27.s, z28.b, z4.b[3]\n"
       "54:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 49b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "st1w { z8.s }, p4, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "st1w { z8.s }, p4, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
       "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
       "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z12.s }, p4, [x24]\n"
-      "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z20.s }, p4, [x22]\n"
-      "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
-      "st1w { z24.s }, p4, [x21]\n"
-      "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
-      "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
-      "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z12.s }, p4, [x23]\n"
+      "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x22]\n"
+      "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z20.s }, p4, [x21]\n"
+      "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z24.s }, p4, [x20]\n"
+      "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
       "55:"  // Height 5: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -1407,16 +1407,16 @@
       "60:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 61f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 62f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1428,143 +1428,143 @@
       "b 62f\n"
       "61:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "62:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "ble 64f\n"
       "63:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z0.b }, p0/Z, [x26]\n"
-      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z7.b }, p0/Z, [x26]\n"
+      "ld1rqb { z6.b }, p0/Z, [x25]\n"
       "sub x27, x27, #0x10\n"
-      "ld1rqb { z2.b }, p0/Z, [x24]\n"
-      "ld1rqb { z3.b }, p0/Z, [x23]\n"
+      "ld1rqb { z5.b }, p0/Z, [x24]\n"
+      "ld1rqb { z4.b }, p0/Z, [x23]\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      "ld1rqb { z4.b }, p0/Z, [x22]\n"
-      "ld1rqb { z5.b }, p0/Z, [x21]\n"
+      "ld1rqb { z3.b }, p0/Z, [x22]\n"
+      "ld1rqb { z2.b }, p0/Z, [x21]\n"
       "add x25, x25, #0x10\n"
       "add x24, x24, #0x10\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z1.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z1.b, z7.b[0]\n"
+      "udot z12.s, z1.b, z6.b[0]\n"
+      "udot z16.s, z1.b, z5.b[0]\n"
+      "udot z20.s, z1.b, z4.b[0]\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      "udot z24.s, z6.b, z4.b[0]\n"
-      "udot z28.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z24.s, z1.b, z3.b[0]\n"
+      "udot z28.s, z1.b, z2.b[0]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
       "add x21, x21, #0x10\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "udot z21.s, z7.b, z3.b[0]\n"
-      "udot z25.s, z7.b, z4.b[0]\n"
-      "udot z29.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "udot z26.s, z6.b, z4.b[0]\n"
-      "udot z30.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "udot z23.s, z7.b, z3.b[0]\n"
-      "udot z27.s, z7.b, z4.b[0]\n"
-      "udot z31.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z20.s, z6.b, z3.b[1]\n"
-      "udot z24.s, z6.b, z4.b[1]\n"
-      "udot z28.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "udot z21.s, z7.b, z3.b[1]\n"
-      "udot z25.s, z7.b, z4.b[1]\n"
-      "udot z29.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "udot z9.s, z0.b, z7.b[0]\n"
+      "udot z13.s, z0.b, z6.b[0]\n"
+      "udot z17.s, z0.b, z5.b[0]\n"
+      "udot z21.s, z0.b, z4.b[0]\n"
+      "udot z25.s, z0.b, z3.b[0]\n"
+      "udot z29.s, z0.b, z2.b[0]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z10.s, z1.b, z7.b[0]\n"
+      "udot z14.s, z1.b, z6.b[0]\n"
+      "udot z18.s, z1.b, z5.b[0]\n"
+      "udot z22.s, z1.b, z4.b[0]\n"
+      "udot z26.s, z1.b, z3.b[0]\n"
+      "udot z30.s, z1.b, z2.b[0]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "udot z11.s, z0.b, z7.b[0]\n"
+      "udot z15.s, z0.b, z6.b[0]\n"
+      "udot z19.s, z0.b, z5.b[0]\n"
+      "udot z23.s, z0.b, z4.b[0]\n"
+      "udot z27.s, z0.b, z3.b[0]\n"
+      "udot z31.s, z0.b, z2.b[0]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "udot z8.s, z1.b, z7.b[1]\n"
+      "udot z12.s, z1.b, z6.b[1]\n"
+      "udot z16.s, z1.b, z5.b[1]\n"
+      "udot z20.s, z1.b, z4.b[1]\n"
+      "udot z24.s, z1.b, z3.b[1]\n"
+      "udot z28.s, z1.b, z2.b[1]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "udot z9.s, z0.b, z7.b[1]\n"
+      "udot z13.s, z0.b, z6.b[1]\n"
+      "udot z17.s, z0.b, z5.b[1]\n"
+      "udot z21.s, z0.b, z4.b[1]\n"
+      "udot z25.s, z0.b, z3.b[1]\n"
+      "udot z29.s, z0.b, z2.b[1]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z22.s, z6.b, z3.b[1]\n"
-      "udot z26.s, z6.b, z4.b[1]\n"
-      "udot z30.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "udot z23.s, z7.b, z3.b[1]\n"
-      "udot z27.s, z7.b, z4.b[1]\n"
-      "udot z31.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z20.s, z6.b, z3.b[2]\n"
-      "udot z24.s, z6.b, z4.b[2]\n"
-      "udot z28.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "udot z21.s, z7.b, z3.b[2]\n"
-      "udot z25.s, z7.b, z4.b[2]\n"
-      "udot z29.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z22.s, z6.b, z3.b[2]\n"
-      "udot z26.s, z6.b, z4.b[2]\n"
-      "udot z30.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "udot z23.s, z7.b, z3.b[2]\n"
-      "udot z27.s, z7.b, z4.b[2]\n"
-      "udot z31.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z20.s, z6.b, z3.b[3]\n"
-      "udot z24.s, z6.b, z4.b[3]\n"
-      "udot z28.s, z6.b, z5.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "udot z21.s, z7.b, z3.b[3]\n"
-      "udot z25.s, z7.b, z4.b[3]\n"
-      "udot z29.s, z7.b, z5.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[3]\n"
-      "udot z26.s, z6.b, z4.b[3]\n"
-      "udot z30.s, z6.b, z5.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
-      "udot z23.s, z7.b, z3.b[3]\n"
-      "udot z27.s, z7.b, z4.b[3]\n"
-      "udot z31.s, z7.b, z5.b[3]\n"
+      "udot z10.s, z1.b, z7.b[1]\n"
+      "udot z14.s, z1.b, z6.b[1]\n"
+      "udot z18.s, z1.b, z5.b[1]\n"
+      "udot z22.s, z1.b, z4.b[1]\n"
+      "udot z26.s, z1.b, z3.b[1]\n"
+      "udot z30.s, z1.b, z2.b[1]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "udot z11.s, z0.b, z7.b[1]\n"
+      "udot z15.s, z0.b, z6.b[1]\n"
+      "udot z19.s, z0.b, z5.b[1]\n"
+      "udot z23.s, z0.b, z4.b[1]\n"
+      "udot z27.s, z0.b, z3.b[1]\n"
+      "udot z31.s, z0.b, z2.b[1]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      "udot z8.s, z1.b, z7.b[2]\n"
+      "udot z12.s, z1.b, z6.b[2]\n"
+      "udot z16.s, z1.b, z5.b[2]\n"
+      "udot z20.s, z1.b, z4.b[2]\n"
+      "udot z24.s, z1.b, z3.b[2]\n"
+      "udot z28.s, z1.b, z2.b[2]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "udot z9.s, z0.b, z7.b[2]\n"
+      "udot z13.s, z0.b, z6.b[2]\n"
+      "udot z17.s, z0.b, z5.b[2]\n"
+      "udot z21.s, z0.b, z4.b[2]\n"
+      "udot z25.s, z0.b, z3.b[2]\n"
+      "udot z29.s, z0.b, z2.b[2]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      "udot z10.s, z1.b, z7.b[2]\n"
+      "udot z14.s, z1.b, z6.b[2]\n"
+      "udot z18.s, z1.b, z5.b[2]\n"
+      "udot z22.s, z1.b, z4.b[2]\n"
+      "udot z26.s, z1.b, z3.b[2]\n"
+      "udot z30.s, z1.b, z2.b[2]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "udot z11.s, z0.b, z7.b[2]\n"
+      "udot z15.s, z0.b, z6.b[2]\n"
+      "udot z19.s, z0.b, z5.b[2]\n"
+      "udot z23.s, z0.b, z4.b[2]\n"
+      "udot z27.s, z0.b, z3.b[2]\n"
+      "udot z31.s, z0.b, z2.b[2]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      "udot z8.s, z1.b, z7.b[3]\n"
+      "udot z12.s, z1.b, z6.b[3]\n"
+      "udot z16.s, z1.b, z5.b[3]\n"
+      "udot z20.s, z1.b, z4.b[3]\n"
+      "udot z24.s, z1.b, z3.b[3]\n"
+      "udot z28.s, z1.b, z2.b[3]\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "udot z9.s, z0.b, z7.b[3]\n"
+      "udot z13.s, z0.b, z6.b[3]\n"
+      "udot z17.s, z0.b, z5.b[3]\n"
+      "udot z21.s, z0.b, z4.b[3]\n"
+      "udot z25.s, z0.b, z3.b[3]\n"
+      "udot z29.s, z0.b, z2.b[3]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "udot z10.s, z1.b, z7.b[3]\n"
+      "udot z14.s, z1.b, z6.b[3]\n"
+      "udot z18.s, z1.b, z5.b[3]\n"
+      "udot z22.s, z1.b, z4.b[3]\n"
+      "udot z26.s, z1.b, z3.b[3]\n"
+      "udot z30.s, z1.b, z2.b[3]\n"
+      "udot z11.s, z0.b, z7.b[3]\n"
+      "udot z15.s, z0.b, z6.b[3]\n"
+      "udot z19.s, z0.b, z5.b[3]\n"
+      "udot z23.s, z0.b, z4.b[3]\n"
+      "udot z27.s, z0.b, z3.b[3]\n"
+      "udot z31.s, z0.b, z2.b[3]\n"
       "bgt 63b\n"
       "64:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
@@ -1575,127 +1575,127 @@
       "ld1rqb { z3.b }, p0/Z, [x23]\n"
       "ld1rqb { z4.b }, p0/Z, [x22]\n"
       "ld1rqb { z5.b }, p0/Z, [x21]\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[0]\n"
-      "udot z12.s, z6.b, z1.b[0]\n"
-      "udot z16.s, z6.b, z2.b[0]\n"
-      "udot z20.s, z6.b, z3.b[0]\n"
-      "udot z24.s, z6.b, z4.b[0]\n"
-      "udot z28.s, z6.b, z5.b[0]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[0]\n"
-      "udot z13.s, z7.b, z1.b[0]\n"
-      "udot z17.s, z7.b, z2.b[0]\n"
-      "udot z21.s, z7.b, z3.b[0]\n"
-      "udot z25.s, z7.b, z4.b[0]\n"
-      "udot z29.s, z7.b, z5.b[0]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z7.b, z0.b[0]\n"
+      "udot z12.s, z7.b, z1.b[0]\n"
+      "udot z16.s, z7.b, z2.b[0]\n"
+      "udot z20.s, z7.b, z3.b[0]\n"
+      "udot z24.s, z7.b, z4.b[0]\n"
+      "udot z28.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z9.s, z6.b, z0.b[0]\n"
+      "udot z13.s, z6.b, z1.b[0]\n"
+      "udot z17.s, z6.b, z2.b[0]\n"
+      "udot z21.s, z6.b, z3.b[0]\n"
+      "udot z25.s, z6.b, z4.b[0]\n"
+      "udot z29.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[0]\n"
-      "udot z14.s, z6.b, z1.b[0]\n"
-      "udot z18.s, z6.b, z2.b[0]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "udot z26.s, z6.b, z4.b[0]\n"
-      "udot z30.s, z6.b, z5.b[0]\n"
-      "udot z11.s, z7.b, z0.b[0]\n"
-      "udot z15.s, z7.b, z1.b[0]\n"
-      "udot z19.s, z7.b, z2.b[0]\n"
-      "udot z23.s, z7.b, z3.b[0]\n"
-      "udot z27.s, z7.b, z4.b[0]\n"
-      "udot z31.s, z7.b, z5.b[0]\n"
+      "udot z10.s, z7.b, z0.b[0]\n"
+      "udot z14.s, z7.b, z1.b[0]\n"
+      "udot z18.s, z7.b, z2.b[0]\n"
+      "udot z22.s, z7.b, z3.b[0]\n"
+      "udot z26.s, z7.b, z4.b[0]\n"
+      "udot z30.s, z7.b, z5.b[0]\n"
+      "udot z11.s, z6.b, z0.b[0]\n"
+      "udot z15.s, z6.b, z1.b[0]\n"
+      "udot z19.s, z6.b, z2.b[0]\n"
+      "udot z23.s, z6.b, z3.b[0]\n"
+      "udot z27.s, z6.b, z4.b[0]\n"
+      "udot z31.s, z6.b, z5.b[0]\n"
       "ble 65f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[1]\n"
-      "udot z12.s, z6.b, z1.b[1]\n"
-      "udot z16.s, z6.b, z2.b[1]\n"
-      "udot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z7.b, z0.b[1]\n"
+      "udot z12.s, z7.b, z1.b[1]\n"
+      "udot z16.s, z7.b, z2.b[1]\n"
+      "udot z20.s, z7.b, z3.b[1]\n"
       "subs x27, x27, #0x4\n"
-      "udot z24.s, z6.b, z4.b[1]\n"
-      "udot z28.s, z6.b, z5.b[1]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[1]\n"
-      "udot z13.s, z7.b, z1.b[1]\n"
-      "udot z17.s, z7.b, z2.b[1]\n"
-      "udot z21.s, z7.b, z3.b[1]\n"
-      "udot z25.s, z7.b, z4.b[1]\n"
-      "udot z29.s, z7.b, z5.b[1]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z24.s, z7.b, z4.b[1]\n"
+      "udot z28.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z9.s, z6.b, z0.b[1]\n"
+      "udot z13.s, z6.b, z1.b[1]\n"
+      "udot z17.s, z6.b, z2.b[1]\n"
+      "udot z21.s, z6.b, z3.b[1]\n"
+      "udot z25.s, z6.b, z4.b[1]\n"
+      "udot z29.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[1]\n"
-      "udot z14.s, z6.b, z1.b[1]\n"
-      "udot z18.s, z6.b, z2.b[1]\n"
-      "udot z22.s, z6.b, z3.b[1]\n"
-      "udot z26.s, z6.b, z4.b[1]\n"
-      "udot z30.s, z6.b, z5.b[1]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z15.s, z7.b, z1.b[1]\n"
-      "udot z19.s, z7.b, z2.b[1]\n"
-      "udot z23.s, z7.b, z3.b[1]\n"
-      "udot z27.s, z7.b, z4.b[1]\n"
-      "udot z31.s, z7.b, z5.b[1]\n"
+      "udot z10.s, z7.b, z0.b[1]\n"
+      "udot z14.s, z7.b, z1.b[1]\n"
+      "udot z18.s, z7.b, z2.b[1]\n"
+      "udot z22.s, z7.b, z3.b[1]\n"
+      "udot z26.s, z7.b, z4.b[1]\n"
+      "udot z30.s, z7.b, z5.b[1]\n"
+      "udot z11.s, z6.b, z0.b[1]\n"
+      "udot z15.s, z6.b, z1.b[1]\n"
+      "udot z19.s, z6.b, z2.b[1]\n"
+      "udot z23.s, z6.b, z3.b[1]\n"
+      "udot z27.s, z6.b, z4.b[1]\n"
+      "udot z31.s, z6.b, z5.b[1]\n"
       "ble 65f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[2]\n"
-      "udot z12.s, z6.b, z1.b[2]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z7.b, z0.b[2]\n"
+      "udot z12.s, z7.b, z1.b[2]\n"
+      "udot z16.s, z7.b, z2.b[2]\n"
+      "udot z20.s, z7.b, z3.b[2]\n"
       "subs x27, x27, #0x4\n"
-      "udot z24.s, z6.b, z4.b[2]\n"
-      "udot z28.s, z6.b, z5.b[2]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[2]\n"
-      "udot z13.s, z7.b, z1.b[2]\n"
-      "udot z17.s, z7.b, z2.b[2]\n"
-      "udot z21.s, z7.b, z3.b[2]\n"
-      "udot z25.s, z7.b, z4.b[2]\n"
-      "udot z29.s, z7.b, z5.b[2]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "udot z24.s, z7.b, z4.b[2]\n"
+      "udot z28.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z9.s, z6.b, z0.b[2]\n"
+      "udot z13.s, z6.b, z1.b[2]\n"
+      "udot z17.s, z6.b, z2.b[2]\n"
+      "udot z21.s, z6.b, z3.b[2]\n"
+      "udot z25.s, z6.b, z4.b[2]\n"
+      "udot z29.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[2]\n"
-      "udot z14.s, z6.b, z1.b[2]\n"
-      "udot z18.s, z6.b, z2.b[2]\n"
-      "udot z22.s, z6.b, z3.b[2]\n"
-      "udot z26.s, z6.b, z4.b[2]\n"
-      "udot z30.s, z6.b, z5.b[2]\n"
-      "udot z11.s, z7.b, z0.b[2]\n"
-      "udot z15.s, z7.b, z1.b[2]\n"
-      "udot z19.s, z7.b, z2.b[2]\n"
-      "udot z23.s, z7.b, z3.b[2]\n"
-      "udot z27.s, z7.b, z4.b[2]\n"
-      "udot z31.s, z7.b, z5.b[2]\n"
+      "udot z10.s, z7.b, z0.b[2]\n"
+      "udot z14.s, z7.b, z1.b[2]\n"
+      "udot z18.s, z7.b, z2.b[2]\n"
+      "udot z22.s, z7.b, z3.b[2]\n"
+      "udot z26.s, z7.b, z4.b[2]\n"
+      "udot z30.s, z7.b, z5.b[2]\n"
+      "udot z11.s, z6.b, z0.b[2]\n"
+      "udot z15.s, z6.b, z1.b[2]\n"
+      "udot z19.s, z6.b, z2.b[2]\n"
+      "udot z23.s, z6.b, z3.b[2]\n"
+      "udot z27.s, z6.b, z4.b[2]\n"
+      "udot z31.s, z6.b, z5.b[2]\n"
       "ble 65f\n"
-      "ld1b { z6.b }, p5/Z, [x10]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n"
-      "udot z8.s, z6.b, z0.b[3]\n"
-      "udot z12.s, z6.b, z1.b[3]\n"
-      "udot z16.s, z6.b, z2.b[3]\n"
-      "udot z20.s, z6.b, z3.b[3]\n"
-      "udot z24.s, z6.b, z4.b[3]\n"
-      "udot z28.s, z6.b, z5.b[3]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "udot z9.s, z7.b, z0.b[3]\n"
-      "udot z13.s, z7.b, z1.b[3]\n"
-      "udot z17.s, z7.b, z2.b[3]\n"
-      "udot z21.s, z7.b, z3.b[3]\n"
-      "udot z25.s, z7.b, z4.b[3]\n"
-      "udot z29.s, z7.b, z5.b[3]\n"
-      "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x10]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
+      "udot z8.s, z7.b, z0.b[3]\n"
+      "udot z12.s, z7.b, z1.b[3]\n"
+      "udot z16.s, z7.b, z2.b[3]\n"
+      "udot z20.s, z7.b, z3.b[3]\n"
+      "udot z24.s, z7.b, z4.b[3]\n"
+      "udot z28.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "udot z9.s, z6.b, z0.b[3]\n"
+      "udot z13.s, z6.b, z1.b[3]\n"
+      "udot z17.s, z6.b, z2.b[3]\n"
+      "udot z21.s, z6.b, z3.b[3]\n"
+      "udot z25.s, z6.b, z4.b[3]\n"
+      "udot z29.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
       "addvl x10, x10, #4\n"
-      "udot z10.s, z6.b, z0.b[3]\n"
-      "udot z14.s, z6.b, z1.b[3]\n"
-      "udot z18.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[3]\n"
-      "udot z26.s, z6.b, z4.b[3]\n"
-      "udot z30.s, z6.b, z5.b[3]\n"
-      "udot z11.s, z7.b, z0.b[3]\n"
-      "udot z15.s, z7.b, z1.b[3]\n"
-      "udot z19.s, z7.b, z2.b[3]\n"
-      "udot z23.s, z7.b, z3.b[3]\n"
-      "udot z27.s, z7.b, z4.b[3]\n"
-      "udot z31.s, z7.b, z5.b[3]\n"
+      "udot z10.s, z7.b, z0.b[3]\n"
+      "udot z14.s, z7.b, z1.b[3]\n"
+      "udot z18.s, z7.b, z2.b[3]\n"
+      "udot z22.s, z7.b, z3.b[3]\n"
+      "udot z26.s, z7.b, z4.b[3]\n"
+      "udot z30.s, z7.b, z5.b[3]\n"
+      "udot z11.s, z6.b, z0.b[3]\n"
+      "udot z15.s, z6.b, z1.b[3]\n"
+      "udot z19.s, z6.b, z2.b[3]\n"
+      "udot z23.s, z6.b, z3.b[3]\n"
+      "udot z27.s, z6.b, z4.b[3]\n"
+      "udot z31.s, z6.b, z5.b[3]\n"
       "65:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1748,7 +1748,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "68:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1756,4 +1755,4 @@
 }
 
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index c0d0892..8c6a3db 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@
     template<typename T>
     static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
     {
-
         if (std::is_same<T, uint32_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -86,7 +85,6 @@
             }
         }
 
-
         if (std::is_same<T, uint8_t>::value) {
             switch (ci->get_cpu_model()) {
                 default:
@@ -111,5 +109,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
index 59f3328..9269576 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
@@ -100,16 +100,16 @@
       "incw x20\n"
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 3f\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 4f\n"
@@ -127,11 +127,11 @@
       "5:"  // Height 1: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 6f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
       "cbnz x28, 7f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -143,86 +143,86 @@
       "ble 9f\n"
       "8:"  // Height 1: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqb { z20.b }, p0/Z, [x26]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d19a48  // ummla z8.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4c  // ummla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d19a49  // ummla z9.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4d  // ummla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d19a4a  // ummla z10.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4e  // ummla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x45d19a4b  // ummla z11.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4f  // ummla z15.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45d19a88  // ummla z8.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8c  // ummla z12.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45d19a89  // ummla z9.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8d  // ummla z13.s, z20.b, z16.b\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45d09a8a  // ummla z10.s, z20.b, z16.b\n"
+      ".inst 0x45c79a8e  // ummla z14.s, z20.b, z7.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
+      ".inst 0x45d19a8b  // ummla z11.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8f  // ummla z15.s, z20.b, z16.b\n"
       "add x26, x26, #0x10\n"
       "bgt 8b\n"
       "9:"  // Height 1: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d19a48  // ummla z8.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4c  // ummla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d19a49  // ummla z9.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4d  // ummla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d19a4a  // ummla z10.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4e  // ummla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x45d19a4b  // ummla z11.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4f  // ummla z15.s, z18.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "ble 10f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d19828  // ummla z8.s, z1.b, z17.b\n"
+      ".inst 0x45d0982c  // ummla z12.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d19829  // ummla z9.s, z1.b, z17.b\n"
+      ".inst 0x45d0982d  // ummla z13.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d1982a  // ummla z10.s, z1.b, z17.b\n"
+      ".inst 0x45d0982e  // ummla z14.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45d1982b  // ummla z11.s, z1.b, z17.b\n"
+      ".inst 0x45d0982f  // ummla z15.s, z1.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "10:"  // Height 1: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -258,21 +258,21 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 14f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x9, x20, LSL #2\n"
+      "ld1w { z18.s }, p4/Z, [x9]\n"
+      "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "ld1w { z12.s }, p4/Z, [x20]\n"
+      "zip1 z8.d, z18.d, z12.d\n"
+      "zip2 z12.d, z18.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
+      "zip1 z9.d, z2.d, z13.d\n"
+      "zip2 z13.d, z2.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
+      "zip1 z10.d, z17.d, z14.d\n"
+      "zip2 z14.d, z17.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
       "b 15f\n"
@@ -290,12 +290,12 @@
       "16:"  // Height 2: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 17f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
       "cbnz x28, 18f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -303,95 +303,95 @@
       "b 18f\n"
       "17:"  // Height 2: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
+      "add x25, x26, x21\n"
       "18:"  // Height 2: input setup done
       "cmp x27, #0x10\n"
       "ble 20f\n"
       "19:"  // Height 2: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqb { z20.b }, p0/Z, [x26]\n"
+      "ld1rqb { z19.b }, p0/Z, [x25]\n"
+      "trn1 z18.d, z20.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d19a48  // ummla z8.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4c  // ummla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d19a49  // ummla z9.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4d  // ummla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d19a4a  // ummla z10.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4e  // ummla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      "trn2 z20.d, z20.d, z19.d\n"
+      ".inst 0x45d19a4b  // ummla z11.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4f  // ummla z15.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45d19a88  // ummla z8.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8c  // ummla z12.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45d19a89  // ummla z9.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8d  // ummla z13.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45d19a8a  // ummla z10.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8e  // ummla z14.s, z20.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
       "sub x27, x27, #0x10\n"
       "cmp x27, #0x10\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
+      ".inst 0x45d19a8b  // ummla z11.s, z20.b, z17.b\n"
+      ".inst 0x45d09a8f  // ummla z15.s, z20.b, z16.b\n"
       "add x26, x26, #0x10\n"
       "add x25, x25, #0x10\n"
       "bgt 19b\n"
       "20:"  // Height 2: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1rqb { z19.b }, p0/Z, [x25]\n"
+      "trn1 z18.d, z1.d, z19.d\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d19a48  // ummla z8.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4c  // ummla z12.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d19a49  // ummla z9.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4d  // ummla z13.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d19a4a  // ummla z10.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4e  // ummla z14.s, z18.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
+      "trn2 z1.d, z1.d, z19.d\n"
+      ".inst 0x45d19a4b  // ummla z11.s, z18.b, z17.b\n"
+      ".inst 0x45d09a4f  // ummla z15.s, z18.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "ble 21f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
+      "ld1b { z17.b }, p5/Z, [x10]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d19828  // ummla z8.s, z1.b, z17.b\n"
+      ".inst 0x45d0982c  // ummla z12.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d19829  // ummla z9.s, z1.b, z17.b\n"
+      ".inst 0x45d0982d  // ummla z13.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d1982a  // ummla z10.s, z1.b, z17.b\n"
+      ".inst 0x45d0982e  // ummla z14.s, z1.b, z16.b\n"
+      "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
+      "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45d1982b  // ummla z11.s, z1.b, z17.b\n"
+      ".inst 0x45d0982f  // ummla z15.s, z1.b, z16.b\n"
       "addvl x10, x10, #8\n"
       "21:"  // Height 2: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
@@ -399,24 +399,24 @@
       "cmp x28, x20\n"
       "bne 16b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "add x20, x9, x20, LSL #2\n"
+      "uzp1 z16.d, z8.d, z12.d\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
+      "uzp1 z17.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
+      "st1w { z16.s }, p4, [x9]\n"
+      "uzp1 z16.d, z10.d, z14.d\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
+      "st1w { z17.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z2.d, z11.d, z15.d\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "st1w { z16.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
-      "st1w { z8.s }, p4, [x24]\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+      "st1w { z8.s }, p4, [x20]\n"
+      "st1w { z9.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x20, #3, MUL VL]\n"
       "22:"  // Height 2: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -437,28 +437,28 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 25f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x21]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x20]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
       "zip1 z17.d, z18.d, z21.d\n"
@@ -490,13 +490,13 @@
       "27:"  // Height 3: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 28f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
       "cbnz x28, 29f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -505,169 +505,169 @@
       "b 29f\n"
       "28:"  // Height 3: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
       "29:"  // Height 3: input setup done
       "cmp x27, #0x10\n"
       "ble 31f\n"
       "30:"  // Height 3: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "ld1rqb { z30.b }, p0/Z, [x26]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "ld1rqb { z28.b }, p0/Z, [x24]\n"
+      "trn1 z27.d, z30.d, z24.d\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "trn1 z26.d, z28.d, z29.d\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d99b68  // ummla z8.s, z27.b, z25.b\n"
+      ".inst 0x45d99b50  // ummla z16.s, z26.b, z25.b\n"
+      ".inst 0x45d89b6c  // ummla z12.s, z27.b, z24.b\n"
+      ".inst 0x45d89b54  // ummla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d99b69  // ummla z9.s, z27.b, z25.b\n"
+      ".inst 0x45d99b51  // ummla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z29.d\n"
+      ".inst 0x45d89b6d  // ummla z13.s, z27.b, z24.b\n"
+      ".inst 0x45d89b55  // ummla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45d99b6a  // ummla z10.s, z27.b, z25.b\n"
+      ".inst 0x45d99b52  // ummla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45d89b6e  // ummla z14.s, z27.b, z24.b\n"
+      ".inst 0x45d89b56  // ummla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x45d99b6b  // ummla z11.s, z27.b, z25.b\n"
+      ".inst 0x45d99b53  // ummla z19.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45d89b6f  // ummla z15.s, z27.b, z24.b\n"
+      ".inst 0x45d89b57  // ummla z23.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x45d99bc8  // ummla z8.s, z30.b, z25.b\n"
+      ".inst 0x45d99b90  // ummla z16.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
+      ".inst 0x45d89bcc  // ummla z12.s, z30.b, z24.b\n"
+      ".inst 0x45d89b94  // ummla z20.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45d99bc9  // ummla z9.s, z30.b, z25.b\n"
+      ".inst 0x45d99b91  // ummla z17.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x45d89bcd  // ummla z13.s, z30.b, z24.b\n"
+      ".inst 0x45d89b95  // ummla z21.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45d99bca  // ummla z10.s, z30.b, z25.b\n"
+      ".inst 0x45d99b92  // ummla z18.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x45d89bce  // ummla z14.s, z30.b, z24.b\n"
+      ".inst 0x45d89b96  // ummla z22.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x45d99bcb  // ummla z11.s, z30.b, z25.b\n"
+      ".inst 0x45d99b93  // ummla z19.s, z28.b, z25.b\n"
+      ".inst 0x45d89bcf  // ummla z15.s, z30.b, z24.b\n"
+      ".inst 0x45d89b97  // ummla z23.s, z28.b, z24.b\n"
       "bgt 30b\n"
       "31:"  // Height 3: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn1 z27.d, z1.d, z24.d\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "trn1 z26.d, z3.d, z28.d\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d99b68  // ummla z8.s, z27.b, z25.b\n"
+      ".inst 0x45d99b50  // ummla z16.s, z26.b, z25.b\n"
+      ".inst 0x45d89b6c  // ummla z12.s, z27.b, z24.b\n"
+      ".inst 0x45d89b54  // ummla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d99b69  // ummla z9.s, z27.b, z25.b\n"
+      ".inst 0x45d99b51  // ummla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45d89b6d  // ummla z13.s, z27.b, z24.b\n"
+      ".inst 0x45d89b55  // ummla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z28.d\n"
+      ".inst 0x45d99b6a  // ummla z10.s, z27.b, z25.b\n"
+      ".inst 0x45d99b52  // ummla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45d89b6e  // ummla z14.s, z27.b, z24.b\n"
+      ".inst 0x45d89b56  // ummla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
+      ".inst 0x45d99b6b  // ummla z11.s, z27.b, z25.b\n"
+      ".inst 0x45d99b53  // ummla z19.s, z26.b, z25.b\n"
+      ".inst 0x45d89b6f  // ummla z15.s, z27.b, z24.b\n"
+      ".inst 0x45d89b57  // ummla z23.s, z26.b, z24.b\n"
       "ble 32f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d99828  // ummla z8.s, z1.b, z25.b\n"
+      ".inst 0x45d99870  // ummla z16.s, z3.b, z25.b\n"
+      ".inst 0x45d8982c  // ummla z12.s, z1.b, z24.b\n"
+      ".inst 0x45d89874  // ummla z20.s, z3.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d99829  // ummla z9.s, z1.b, z25.b\n"
+      ".inst 0x45d99871  // ummla z17.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45d8982d  // ummla z13.s, z1.b, z24.b\n"
+      ".inst 0x45d89875  // ummla z21.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d9982a  // ummla z10.s, z1.b, z25.b\n"
+      ".inst 0x45d99872  // ummla z18.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45d8982e  // ummla z14.s, z1.b, z24.b\n"
+      ".inst 0x45d89876  // ummla z22.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
+      ".inst 0x45d9982b  // ummla z11.s, z1.b, z25.b\n"
+      ".inst 0x45d99873  // ummla z19.s, z3.b, z25.b\n"
+      ".inst 0x45d8982f  // ummla z15.s, z1.b, z24.b\n"
+      ".inst 0x45d89877  // ummla z23.s, z3.b, z24.b\n"
       "32:"  // Height 3: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 27b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "add x21, x9, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
+      "uzp1 z25.d, z8.d, z12.d\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "uzp1 z24.d, z9.d, z13.d\n"
+      "st1w { z25.s }, p4, [x9]\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z25.d, z10.d, z14.d\n"
+      "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+      "uzp1 z24.d, z11.d, z15.d\n"
+      "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
       "uzp2 z11.d, z11.d, z15.d\n"
       "uzp1 z16.d, z16.d, z20.d\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
       "uzp1 z17.d, z17.d, z21.d\n"
       "uzp1 z18.d, z18.d, z22.d\n"
-      "st1w { z8.s }, p4, [x24]\n"
+      "st1w { z8.s }, p4, [x21]\n"
       "uzp1 z19.d, z19.d, z23.d\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x23]\n"
-      "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+      "st1w { z9.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x20]\n"
+      "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
       "33:"  // Height 3: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -688,37 +688,37 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 36f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
-      "add x22, x23, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x22]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x21]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x20]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
@@ -746,14 +746,14 @@
       "38:"  // Height 4: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 39f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
       "cbnz x28, 40f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -763,182 +763,182 @@
       "b 40f\n"
       "39:"  // Height 4: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
       "40:"  // Height 4: input setup done
       "cmp x27, #0x10\n"
       "ble 42f\n"
       "41:"  // Height 4: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "ld1rqb { z30.b }, p0/Z, [x26]\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "trn1 z29.d, z30.d, z24.d\n"
+      "ld1rqb { z28.b }, p0/Z, [x24]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn2 z30.d, z30.d, z24.d\n"
+      "trn1 z26.d, z28.d, z27.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d99ba8  // ummla z8.s, z29.b, z25.b\n"
+      ".inst 0x45d99b50  // ummla z16.s, z26.b, z25.b\n"
+      ".inst 0x45d89bac  // ummla z12.s, z29.b, z24.b\n"
+      ".inst 0x45d89b54  // ummla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d99ba9  // ummla z9.s, z29.b, z25.b\n"
+      ".inst 0x45d99b51  // ummla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "trn2 z28.d, z28.d, z27.d\n"
+      ".inst 0x45d89bad  // ummla z13.s, z29.b, z24.b\n"
+      ".inst 0x45d89b55  // ummla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45d99baa  // ummla z10.s, z29.b, z25.b\n"
+      ".inst 0x45d99b52  // ummla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
       "cmp x27, #0x10\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45d89bae  // ummla z14.s, z29.b, z24.b\n"
+      ".inst 0x45d89b56  // ummla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x45d99bab  // ummla z11.s, z29.b, z25.b\n"
+      ".inst 0x45d99b53  // ummla z19.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45d89baf  // ummla z15.s, z29.b, z24.b\n"
+      ".inst 0x45d89b57  // ummla z23.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x45d99bc8  // ummla z8.s, z30.b, z25.b\n"
+      ".inst 0x45d99b90  // ummla z16.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45d89bcc  // ummla z12.s, z30.b, z24.b\n"
+      ".inst 0x45d89b94  // ummla z20.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
       "add x23, x23, #0x10\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
+      ".inst 0x45d99bc9  // ummla z9.s, z30.b, z25.b\n"
+      ".inst 0x45d99b91  // ummla z17.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x45d89bcd  // ummla z13.s, z30.b, z24.b\n"
+      ".inst 0x45d89b95  // ummla z21.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45d99bca  // ummla z10.s, z30.b, z25.b\n"
+      ".inst 0x45d99b92  // ummla z18.s, z28.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x45d89bce  // ummla z14.s, z30.b, z24.b\n"
+      ".inst 0x45d89b96  // ummla z22.s, z28.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x45d99bcb  // ummla z11.s, z30.b, z25.b\n"
+      ".inst 0x45d99b93  // ummla z19.s, z28.b, z25.b\n"
+      ".inst 0x45d89bcf  // ummla z15.s, z30.b, z24.b\n"
+      ".inst 0x45d89b97  // ummla z23.s, z28.b, z24.b\n"
       "bgt 41b\n"
       "42:"  // Height 4: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqb { z24.b }, p0/Z, [x25]\n"
+      "trn1 z28.d, z1.d, z24.d\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      "ld1rqb { z27.b }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z24.d\n"
+      "trn1 z26.d, z3.d, z27.d\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d99b88  // ummla z8.s, z28.b, z25.b\n"
+      ".inst 0x45d99b50  // ummla z16.s, z26.b, z25.b\n"
+      ".inst 0x45d89b8c  // ummla z12.s, z28.b, z24.b\n"
+      ".inst 0x45d89b54  // ummla z20.s, z26.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d99b89  // ummla z9.s, z28.b, z25.b\n"
+      ".inst 0x45d99b51  // ummla z17.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45d89b8d  // ummla z13.s, z28.b, z24.b\n"
+      ".inst 0x45d89b55  // ummla z21.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      "trn2 z3.d, z3.d, z27.d\n"
+      ".inst 0x45d99b8a  // ummla z10.s, z28.b, z25.b\n"
+      ".inst 0x45d99b52  // ummla z18.s, z26.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45d89b8e  // ummla z14.s, z28.b, z24.b\n"
+      ".inst 0x45d89b56  // ummla z22.s, z26.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
+      ".inst 0x45d99b8b  // ummla z11.s, z28.b, z25.b\n"
+      ".inst 0x45d99b53  // ummla z19.s, z26.b, z25.b\n"
+      ".inst 0x45d89b8f  // ummla z15.s, z28.b, z24.b\n"
+      ".inst 0x45d89b57  // ummla z23.s, z26.b, z24.b\n"
       "ble 43f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z25.b }, p5/Z, [x10]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45d99828  // ummla z8.s, z1.b, z25.b\n"
+      ".inst 0x45d99870  // ummla z16.s, z3.b, z25.b\n"
+      ".inst 0x45d8982c  // ummla z12.s, z1.b, z24.b\n"
+      ".inst 0x45d89874  // ummla z20.s, z3.b, z24.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45d99829  // ummla z9.s, z1.b, z25.b\n"
+      ".inst 0x45d99871  // ummla z17.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45d8982d  // ummla z13.s, z1.b, z24.b\n"
+      ".inst 0x45d89875  // ummla z21.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45d9982a  // ummla z10.s, z1.b, z25.b\n"
+      ".inst 0x45d99872  // ummla z18.s, z3.b, z25.b\n"
+      "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45d8982e  // ummla z14.s, z1.b, z24.b\n"
+      ".inst 0x45d89876  // ummla z22.s, z3.b, z24.b\n"
+      "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
+      ".inst 0x45d9982b  // ummla z11.s, z1.b, z25.b\n"
+      ".inst 0x45d99873  // ummla z19.s, z3.b, z25.b\n"
+      ".inst 0x45d8982f  // ummla z15.s, z1.b, z24.b\n"
+      ".inst 0x45d89877  // ummla z23.s, z3.b, z24.b\n"
       "43:"  // Height 4: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 38b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
-      "add x22, x23, x20, LSL #2\n"
+      "add x22, x9, x20, LSL #2\n"
+      "add x21, x22, x20, LSL #2\n"
+      "uzp1 z25.d, z8.d, z12.d\n"
+      "add x20, x21, x20, LSL #2\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "uzp1 z24.d, z9.d, z13.d\n"
+      "st1w { z25.s }, p4, [x9]\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z25.d, z10.d, z14.d\n"
+      "st1w { z24.s }, p3, [x9, #1, MUL VL]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+      "uzp1 z24.d, z11.d, z15.d\n"
+      "st1w { z25.s }, p2, [x9, #2, MUL VL]\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "uzp1 z15.d, z16.d, z20.d\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "uzp1 z25.d, z16.d, z20.d\n"
+      "st1w { z24.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "uzp1 z20.d, z17.d, z21.d\n"
-      "st1w { z8.s }, p4, [x24]\n"
+      "uzp1 z24.d, z17.d, z21.d\n"
+      "st1w { z8.s }, p4, [x22]\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+      "st1w { z9.s }, p3, [x22, #1, MUL VL]\n"
       "uzp2 z18.d, z18.d, z22.d\n"
-      "uzp1 z22.d, z19.d, z23.d\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+      "uzp1 z20.d, z19.d, z23.d\n"
+      "st1w { z10.s }, p2, [x22, #2, MUL VL]\n"
       "uzp2 z19.d, z19.d, z23.d\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
-      "st1w { z15.s }, p4, [x23]\n"
-      "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x22]\n"
-      "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z11.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z25.s }, p4, [x21]\n"
+      "st1w { z24.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z21.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z20.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x20]\n"
+      "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
       "44:"  // Height 4: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -959,54 +959,54 @@
       "whilelt p1.s, x20, x11\n"
       "tbz %x[flags], #0, 47f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "ld1w { z19.s }, p4/Z, [x9]\n"
       "add x21, x22, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "add x20, x21, x20, LSL #2\n"
+      "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
-      "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
-      "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip1 z9.d, z10.d, z13.d\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
-      "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip1 z10.d, z11.d, z14.d\n"
-      "zip2 z14.d, z11.d, z14.d\n"
-      "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
-      "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x23]\n"
+      "zip1 z8.d, z19.d, z12.d\n"
+      "zip2 z12.d, z19.d, z12.d\n"
+      "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "zip1 z9.d, z17.d, z13.d\n"
+      "zip2 z13.d, z17.d, z13.d\n"
+      "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z17.s }, p4/Z, [x22]\n"
+      "zip1 z10.d, z18.d, z14.d\n"
+      "zip2 z14.d, z18.d, z14.d\n"
+      "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "zip2 z15.d, z16.d, z15.d\n"
-      "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
-      "ld1w { z20.s }, p4/Z, [x22]\n"
+      "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x21]\n"
       "zip1 z16.d, z17.d, z20.d\n"
       "zip2 z20.d, z17.d, z20.d\n"
-      "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip1 z17.d, z18.d, z21.d\n"
       "zip2 z21.d, z18.d, z21.d\n"
-      "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
-      "ld1w { z25.s }, p4/Z, [x21]\n"
+      "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z25.s }, p4/Z, [x20]\n"
       "zip1 z18.d, z19.d, z22.d\n"
       "zip2 z22.d, z19.d, z22.d\n"
-      "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
-      "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
       "zip1 z19.d, z24.d, z23.d\n"
       "zip2 z23.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
       "zip1 z24.d, z25.d, z28.d\n"
       "zip2 z28.d, z25.d, z28.d\n"
       "zip1 z25.d, z26.d, z29.d\n"
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 48f\n"
       "47:"  // Height 5: no accumulate
       "mov z8.s, #0x0\n"
@@ -1038,15 +1038,15 @@
       "49:"  // Height 5: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 50f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
       "cbnz x28, 51f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1057,231 +1057,231 @@
       "b 51f\n"
       "50:"  // Height 5: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
       "51:"  // Height 5: input setup done
       "cmp x27, #0x10\n"
       "ble 53f\n"
       "52:"  // Height 5: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c79898  // ummla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqb { z6.b }, p0/Z, [x26]\n"
+      "ld1rqb { z1.b }, p0/Z, [x25]\n"
+      "ld1rqb { z7.b }, p0/Z, [x24]\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn1 z5.d, z6.d, z1.d\n"
+      "trn2 z6.d, z6.d, z1.d\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "trn1 z3.d, z7.d, z2.d\n"
+      "trn2 z7.d, z7.d, z2.d\n"
+      "ld1b { z1.b }, p5/Z, [x10]\n"
+      "trn1 z2.d, z4.d, z0.d\n"
+      "trn2 z4.d, z4.d, z0.d\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45c198a8  // ummla z8.s, z5.b, z1.b\n"
+      ".inst 0x45c19870  // ummla z16.s, z3.b, z1.b\n"
+      ".inst 0x45c19858  // ummla z24.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
+      ".inst 0x45c098ac  // ummla z12.s, z5.b, z0.b\n"
+      ".inst 0x45c09874  // ummla z20.s, z3.b, z0.b\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x45c6989c  // ummla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
+      ".inst 0x45c0985c  // ummla z28.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45c198a9  // ummla z9.s, z5.b, z1.b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      ".inst 0x45c79899  // ummla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45c19871  // ummla z17.s, z3.b, z1.b\n"
+      ".inst 0x45c19859  // ummla z25.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
+      ".inst 0x45c098ad  // ummla z13.s, z5.b, z0.b\n"
+      ".inst 0x45c09875  // ummla z21.s, z3.b, z0.b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x45c6989d  // ummla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      ".inst 0x45c7989a  // ummla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      ".inst 0x45c6989e  // ummla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45c0985d  // ummla z29.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45c198aa  // ummla z10.s, z5.b, z1.b\n"
+      ".inst 0x45c19872  // ummla z18.s, z3.b, z1.b\n"
+      ".inst 0x45c1985a  // ummla z26.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45c098ae  // ummla z14.s, z5.b, z0.b\n"
+      ".inst 0x45c09876  // ummla z22.s, z3.b, z0.b\n"
+      ".inst 0x45c0985e  // ummla z30.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      ".inst 0x45c7989b  // ummla z27.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
-      ".inst 0x45c6989f  // ummla z31.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      ".inst 0x45c798b8  // ummla z24.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      ".inst 0x45c698bc  // ummla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      ".inst 0x45c798b9  // ummla z25.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      ".inst 0x45c698bd  // ummla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      ".inst 0x45c798ba  // ummla z26.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      ".inst 0x45c698be  // ummla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c798bb  // ummla z27.s, z5.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
-      ".inst 0x45c698bf  // ummla z31.s, z5.b, z6.b\n"
+      ".inst 0x45c198ab  // ummla z11.s, z5.b, z1.b\n"
+      ".inst 0x45c19873  // ummla z19.s, z3.b, z1.b\n"
+      ".inst 0x45c1985b  // ummla z27.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x45c098af  // ummla z15.s, z5.b, z0.b\n"
+      ".inst 0x45c09877  // ummla z23.s, z3.b, z0.b\n"
+      ".inst 0x45c0985f  // ummla z31.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45c198c8  // ummla z8.s, z6.b, z1.b\n"
+      ".inst 0x45c198f0  // ummla z16.s, z7.b, z1.b\n"
+      ".inst 0x45c19898  // ummla z24.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x45c098cc  // ummla z12.s, z6.b, z0.b\n"
+      ".inst 0x45c098f4  // ummla z20.s, z7.b, z0.b\n"
+      ".inst 0x45c0989c  // ummla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45c198c9  // ummla z9.s, z6.b, z1.b\n"
+      ".inst 0x45c198f1  // ummla z17.s, z7.b, z1.b\n"
+      ".inst 0x45c19899  // ummla z25.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x45c098cd  // ummla z13.s, z6.b, z0.b\n"
+      ".inst 0x45c098f5  // ummla z21.s, z7.b, z0.b\n"
+      ".inst 0x45c0989d  // ummla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45c198ca  // ummla z10.s, z6.b, z1.b\n"
+      ".inst 0x45c198f2  // ummla z18.s, z7.b, z1.b\n"
+      ".inst 0x45c1989a  // ummla z26.s, z4.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x45c098ce  // ummla z14.s, z6.b, z0.b\n"
+      ".inst 0x45c098f6  // ummla z22.s, z7.b, z0.b\n"
+      ".inst 0x45c0989e  // ummla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x45c198cb  // ummla z11.s, z6.b, z1.b\n"
+      ".inst 0x45c198f3  // ummla z19.s, z7.b, z1.b\n"
+      ".inst 0x45c1989b  // ummla z27.s, z4.b, z1.b\n"
+      ".inst 0x45c098cf  // ummla z15.s, z6.b, z0.b\n"
+      ".inst 0x45c098f7  // ummla z23.s, z7.b, z0.b\n"
+      ".inst 0x45c0989f  // ummla z31.s, z4.b, z0.b\n"
       "bgt 52b\n"
       "53:"  // Height 5: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
+      "ld1rqb { z4.b }, p0/Z, [x25]\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "trn2 z1.d, z1.d, z2.d\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn1 z7.d, z1.d, z4.d\n"
+      "trn2 z1.d, z1.d, z4.d\n"
       "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c79898  // ummla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "trn1 z6.d, z3.d, z2.d\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45c298e8  // ummla z8.s, z7.b, z2.b\n"
+      ".inst 0x45c298d0  // ummla z16.s, z6.b, z2.b\n"
+      ".inst 0x45c29898  // ummla z24.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      ".inst 0x45c6989c  // ummla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      ".inst 0x45c79899  // ummla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      ".inst 0x45c6989d  // ummla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      ".inst 0x45c7989a  // ummla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      ".inst 0x45c6989e  // ummla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
+      ".inst 0x45c098ec  // ummla z12.s, z7.b, z0.b\n"
+      ".inst 0x45c098d4  // ummla z20.s, z6.b, z0.b\n"
+      ".inst 0x45c0989c  // ummla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45c298e9  // ummla z9.s, z7.b, z2.b\n"
+      ".inst 0x45c298d1  // ummla z17.s, z6.b, z2.b\n"
+      ".inst 0x45c29899  // ummla z25.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45c098ed  // ummla z13.s, z7.b, z0.b\n"
+      ".inst 0x45c098d5  // ummla z21.s, z6.b, z0.b\n"
+      ".inst 0x45c0989d  // ummla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45c298ea  // ummla z10.s, z7.b, z2.b\n"
+      ".inst 0x45c298d2  // ummla z18.s, z6.b, z2.b\n"
+      ".inst 0x45c2989a  // ummla z26.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45c098ee  // ummla z14.s, z7.b, z0.b\n"
+      ".inst 0x45c098d6  // ummla z22.s, z6.b, z0.b\n"
+      ".inst 0x45c0989e  // ummla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45c298eb  // ummla z11.s, z7.b, z2.b\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      ".inst 0x45c7989b  // ummla z27.s, z4.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
-      ".inst 0x45c6989f  // ummla z31.s, z4.b, z6.b\n"
+      ".inst 0x45c298d3  // ummla z19.s, z6.b, z2.b\n"
+      ".inst 0x45c2989b  // ummla z27.s, z4.b, z2.b\n"
+      ".inst 0x45c098ef  // ummla z15.s, z7.b, z0.b\n"
+      ".inst 0x45c098d7  // ummla z23.s, z6.b, z0.b\n"
+      ".inst 0x45c0989f  // ummla z31.s, z4.b, z0.b\n"
       "ble 54f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      ".inst 0x45c798b8  // ummla z24.s, z5.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      ".inst 0x45c698bc  // ummla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      ".inst 0x45c798b9  // ummla z25.s, z5.b, z7.b\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      ".inst 0x45c698bd  // ummla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      ".inst 0x45c798ba  // ummla z26.s, z5.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      ".inst 0x45c698be  // ummla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45c29828  // ummla z8.s, z1.b, z2.b\n"
+      ".inst 0x45c29870  // ummla z16.s, z3.b, z2.b\n"
+      ".inst 0x45c298b8  // ummla z24.s, z5.b, z2.b\n"
+      ".inst 0x45c0982c  // ummla z12.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x45c09874  // ummla z20.s, z3.b, z0.b\n"
+      ".inst 0x45c098bc  // ummla z28.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45c29829  // ummla z9.s, z1.b, z2.b\n"
+      ".inst 0x45c29871  // ummla z17.s, z3.b, z2.b\n"
+      ".inst 0x45c298b9  // ummla z25.s, z5.b, z2.b\n"
+      ".inst 0x45c0982d  // ummla z13.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45c09875  // ummla z21.s, z3.b, z0.b\n"
+      ".inst 0x45c098bd  // ummla z29.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45c2982a  // ummla z10.s, z1.b, z2.b\n"
+      ".inst 0x45c29872  // ummla z18.s, z3.b, z2.b\n"
+      ".inst 0x45c298ba  // ummla z26.s, z5.b, z2.b\n"
+      ".inst 0x45c0982e  // ummla z14.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45c09876  // ummla z22.s, z3.b, z0.b\n"
+      ".inst 0x45c098be  // ummla z30.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c798bb  // ummla z27.s, z5.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
-      ".inst 0x45c698bf  // ummla z31.s, z5.b, z6.b\n"
+      ".inst 0x45c2982b  // ummla z11.s, z1.b, z2.b\n"
+      ".inst 0x45c29873  // ummla z19.s, z3.b, z2.b\n"
+      ".inst 0x45c298bb  // ummla z27.s, z5.b, z2.b\n"
+      ".inst 0x45c0982f  // ummla z15.s, z1.b, z0.b\n"
+      ".inst 0x45c09877  // ummla z23.s, z3.b, z0.b\n"
+      ".inst 0x45c098bf  // ummla z31.s, z5.b, z0.b\n"
       "54:"  // Height 5: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
       "cmp x28, x20\n"
       "bne 49b\n"
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
-      "add x24, x9, x20, LSL #2\n"
-      "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "add x23, x9, x20, LSL #2\n"
       "add x22, x23, x20, LSL #2\n"
+      "uzp1 z2.d, z8.d, z12.d\n"
       "add x21, x22, x20, LSL #2\n"
+      "add x20, x21, x20, LSL #2\n"
       "uzp2 z8.d, z8.d, z12.d\n"
-      "uzp1 z12.d, z9.d, z13.d\n"
+      "uzp1 z1.d, z9.d, z13.d\n"
       "uzp2 z9.d, z9.d, z13.d\n"
-      "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "uzp1 z0.d, z10.d, z14.d\n"
+      "st1w { z2.s }, p4, [x9]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
-      "uzp1 z14.d, z11.d, z15.d\n"
-      "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
+      "uzp1 z2.d, z11.d, z15.d\n"
+      "st1w { z1.s }, p3, [x9, #1, MUL VL]\n"
       "uzp2 z11.d, z11.d, z15.d\n"
-      "uzp1 z15.d, z16.d, z20.d\n"
-      "st1w { z13.s }, p2, [x9, #2, MUL VL]\n"
+      "uzp1 z1.d, z16.d, z20.d\n"
+      "st1w { z0.s }, p2, [x9, #2, MUL VL]\n"
       "uzp2 z16.d, z16.d, z20.d\n"
-      "uzp1 z20.d, z17.d, z21.d\n"
-      "st1w { z14.s }, p1, [x9, #3, MUL VL]\n"
+      "uzp1 z0.d, z17.d, z21.d\n"
+      "st1w { z2.s }, p1, [x9, #3, MUL VL]\n"
       "addvl x9, x9, #4\n"
       "uzp2 z17.d, z17.d, z21.d\n"
       "uzp1 z21.d, z18.d, z22.d\n"
-      "st1w { z8.s }, p4, [x24]\n"
+      "st1w { z8.s }, p4, [x23]\n"
       "uzp2 z18.d, z18.d, z22.d\n"
-      "uzp1 z22.d, z19.d, z23.d\n"
-      "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+      "uzp1 z20.d, z19.d, z23.d\n"
+      "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
       "uzp2 z19.d, z19.d, z23.d\n"
       "uzp1 z24.d, z24.d, z28.d\n"
-      "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+      "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
       "uzp1 z25.d, z25.d, z29.d\n"
       "uzp1 z26.d, z26.d, z30.d\n"
-      "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+      "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
       "uzp1 z27.d, z27.d, z31.d\n"
-      "st1w { z15.s }, p4, [x23]\n"
-      "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
-      "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
-      "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
-      "st1w { z16.s }, p4, [x22]\n"
-      "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
-      "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
-      "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
-      "st1w { z24.s }, p4, [x21]\n"
-      "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
-      "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
-      "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z1.s }, p4, [x22]\n"
+      "st1w { z0.s }, p3, [x22, #1, MUL VL]\n"
+      "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+      "st1w { z20.s }, p1, [x22, #3, MUL VL]\n"
+      "st1w { z16.s }, p4, [x21]\n"
+      "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+      "st1w { z24.s }, p4, [x20]\n"
+      "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
       "55:"  // Height 5: Writeback done
       "decw x11, ALL, MUL #4\n"
       "cmp x11, XZR\n"
@@ -1307,26 +1307,26 @@
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
-      "ld1w { z9.s }, p4/Z, [x9]\n"
+      "ld1w { z17.s }, p4/Z, [x9]\n"
       "add x22, x23, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
-      "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n"
-      "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
       "add x20, x21, x20, LSL #2\n"
       "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n"
       "ld1w { z12.s }, p4/Z, [x24]\n"
-      "zip1 z8.d, z9.d, z12.d\n"
+      "zip1 z8.d, z17.d, z12.d\n"
       "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
       "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
-      "zip2 z12.d, z9.d, z12.d\n"
-      "zip1 z9.d, z10.d, z13.d\n"
+      "zip2 z12.d, z17.d, z12.d\n"
+      "zip1 z9.d, z18.d, z13.d\n"
       "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
       "ld1w { z17.s }, p4/Z, [x23]\n"
-      "zip2 z13.d, z10.d, z13.d\n"
-      "zip1 z10.d, z11.d, z14.d\n"
+      "zip2 z13.d, z18.d, z13.d\n"
+      "zip1 z10.d, z20.d, z14.d\n"
       "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
       "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
-      "zip2 z14.d, z11.d, z14.d\n"
+      "zip2 z14.d, z20.d, z14.d\n"
       "zip1 z11.d, z16.d, z15.d\n"
       "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
       "ld1w { z20.s }, p4/Z, [x22]\n"
@@ -1344,7 +1344,7 @@
       "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
       "zip2 z22.d, z19.d, z22.d\n"
       "zip1 z19.d, z24.d, z23.d\n"
-      "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n"
       "ld1w { z28.s }, p4/Z, [x20]\n"
       "zip2 z23.d, z24.d, z23.d\n"
       "zip1 z24.d, z25.d, z28.d\n"
@@ -1356,8 +1356,8 @@
       "zip2 z29.d, z26.d, z29.d\n"
       "zip1 z26.d, z27.d, z30.d\n"
       "zip2 z30.d, z27.d, z30.d\n"
-      "zip1 z27.d, z6.d, z31.d\n"
-      "zip2 z31.d, z6.d, z31.d\n"
+      "zip1 z27.d, z0.d, z31.d\n"
+      "zip2 z31.d, z0.d, z31.d\n"
       "b 59f\n"
       "58:"  // Height 6: no accumulate
       "mov z8.s, #0x0\n"
@@ -1389,16 +1389,16 @@
       "60:"  // Height 6: String loop
       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
       "ldr w27, [x20, x28, LSL #0x2]\n"
-      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
       "tbz %x[flags], #3, 61f\n"
-      "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
-      "add x21, x21, x20, LSL #3\n"
-      "ldr x26, [x21, #0x0]\n"
-      "ldr x25, [x21, #0x8]\n"
-      "ldr x24, [x21, #0x10]\n"
-      "ldr x23, [x21, #0x18]\n"
-      "ldr x22, [x21, #0x20]\n"
-      "ldr x21, [x21, #0x28]\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x21, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x25, [x20, #0x8]\n"
+      "ldr x24, [x20, #0x10]\n"
+      "ldr x23, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x21, [x20, #0x28]\n"
       "cbnz x28, 62f\n"
       "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
       "add x26, x26, x20\n"
@@ -1410,184 +1410,184 @@
       "b 62f\n"
       "61:"  // Height 6: setup direct input
       "mov x26, %x[input_ptr]\n"
-      "add x25, x26, x20\n"
-      "add x24, x25, x20\n"
-      "add x23, x24, x20\n"
-      "add x22, x23, x20\n"
-      "add x21, x22, x20\n"
+      "add x25, x26, x21\n"
+      "add x24, x25, x21\n"
+      "add x23, x24, x21\n"
+      "add x22, x23, x21\n"
+      "add x21, x22, x21\n"
       "62:"  // Height 6: input setup done
       "cmp x27, #0x10\n"
       "ble 64f\n"
       "63:"  // Height 6: Multiply loop: Main loop head
       "whilelt p0.b, XZR, x27\n"
-      "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
-      "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
-      "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "ld1rqb { z6.b }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c79898  // ummla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqb { z7.b }, p0/Z, [x26]\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
+      "trn1 z6.d, z7.d, z0.d\n"
+      "ld1rqb { z5.b }, p0/Z, [x24]\n"
+      "ld1rqb { z1.b }, p0/Z, [x23]\n"
+      "trn2 z7.d, z7.d, z0.d\n"
+      "trn1 z4.d, z5.d, z1.d\n"
+      "ld1rqb { z3.b }, p0/Z, [x22]\n"
+      "ld1rqb { z0.b }, p0/Z, [x21]\n"
+      "trn2 z5.d, z5.d, z1.d\n"
+      "trn1 z2.d, z3.d, z0.d\n"
+      "trn2 z3.d, z3.d, z0.d\n"
+      "ld1b { z1.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45c198c8  // ummla z8.s, z6.b, z1.b\n"
+      ".inst 0x45c19890  // ummla z16.s, z4.b, z1.b\n"
+      ".inst 0x45c19858  // ummla z24.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
       "sub x27, x27, #0x10\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
+      ".inst 0x45c098cc  // ummla z12.s, z6.b, z0.b\n"
+      ".inst 0x45c09894  // ummla z20.s, z4.b, z0.b\n"
       "cmp x27, #0x10\n"
       "add x26, x26, #0x10\n"
-      ".inst 0x45c6989c  // ummla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
+      ".inst 0x45c0985c  // ummla z28.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45c198c9  // ummla z9.s, z6.b, z1.b\n"
       "add x25, x25, #0x10\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      ".inst 0x45c79899  // ummla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45c19891  // ummla z17.s, z4.b, z1.b\n"
+      ".inst 0x45c19859  // ummla z25.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
       "add x24, x24, #0x10\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
+      ".inst 0x45c098cd  // ummla z13.s, z6.b, z0.b\n"
+      ".inst 0x45c09895  // ummla z21.s, z4.b, z0.b\n"
       "add x23, x23, #0x10\n"
       "add x22, x22, #0x10\n"
-      ".inst 0x45c6989d  // ummla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
+      ".inst 0x45c0985d  // ummla z29.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45c198ca  // ummla z10.s, z6.b, z1.b\n"
       "add x21, x21, #0x10\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      ".inst 0x45c7989a  // ummla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      ".inst 0x45c6989e  // ummla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45c19892  // ummla z18.s, z4.b, z1.b\n"
+      ".inst 0x45c1985a  // ummla z26.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45c098ce  // ummla z14.s, z6.b, z0.b\n"
+      ".inst 0x45c09896  // ummla z22.s, z4.b, z0.b\n"
+      ".inst 0x45c0985e  // ummla z30.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #16\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      ".inst 0x45c7989b  // ummla z27.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
-      ".inst 0x45c6989f  // ummla z31.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      ".inst 0x45c798b8  // ummla z24.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      ".inst 0x45c698bc  // ummla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      ".inst 0x45c798b9  // ummla z25.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      ".inst 0x45c698bd  // ummla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      ".inst 0x45c798ba  // ummla z26.s, z5.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      ".inst 0x45c698be  // ummla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c798bb  // ummla z27.s, z5.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
-      ".inst 0x45c698bf  // ummla z31.s, z5.b, z6.b\n"
+      ".inst 0x45c198cb  // ummla z11.s, z6.b, z1.b\n"
+      ".inst 0x45c19893  // ummla z19.s, z4.b, z1.b\n"
+      ".inst 0x45c1985b  // ummla z27.s, z2.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
+      ".inst 0x45c098cf  // ummla z15.s, z6.b, z0.b\n"
+      ".inst 0x45c09897  // ummla z23.s, z4.b, z0.b\n"
+      ".inst 0x45c0985f  // ummla z31.s, z2.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
+      ".inst 0x45c198e8  // ummla z8.s, z7.b, z1.b\n"
+      ".inst 0x45c198b0  // ummla z16.s, z5.b, z1.b\n"
+      ".inst 0x45c19878  // ummla z24.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
+      ".inst 0x45c098ec  // ummla z12.s, z7.b, z0.b\n"
+      ".inst 0x45c098b4  // ummla z20.s, z5.b, z0.b\n"
+      ".inst 0x45c0987c  // ummla z28.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
+      ".inst 0x45c198e9  // ummla z9.s, z7.b, z1.b\n"
+      ".inst 0x45c198b1  // ummla z17.s, z5.b, z1.b\n"
+      ".inst 0x45c19879  // ummla z25.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
+      ".inst 0x45c098ed  // ummla z13.s, z7.b, z0.b\n"
+      ".inst 0x45c098b5  // ummla z21.s, z5.b, z0.b\n"
+      ".inst 0x45c0987d  // ummla z29.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
+      ".inst 0x45c198ea  // ummla z10.s, z7.b, z1.b\n"
+      ".inst 0x45c198b2  // ummla z18.s, z5.b, z1.b\n"
+      ".inst 0x45c1987a  // ummla z26.s, z3.b, z1.b\n"
+      "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
+      ".inst 0x45c098ee  // ummla z14.s, z7.b, z0.b\n"
+      ".inst 0x45c098b6  // ummla z22.s, z5.b, z0.b\n"
+      ".inst 0x45c0987e  // ummla z30.s, z3.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
+      ".inst 0x45c198eb  // ummla z11.s, z7.b, z1.b\n"
+      ".inst 0x45c198b3  // ummla z19.s, z5.b, z1.b\n"
+      ".inst 0x45c1987b  // ummla z27.s, z3.b, z1.b\n"
+      ".inst 0x45c098ef  // ummla z15.s, z7.b, z0.b\n"
+      ".inst 0x45c098b7  // ummla z23.s, z5.b, z0.b\n"
+      ".inst 0x45c0987f  // ummla z31.s, z3.b, z0.b\n"
       "bgt 63b\n"
       "64:"  // Height 6: Multiply loop: Single iteration only
       "whilelt p0.b, XZR, x27\n"
       "ld1rqb { z1.b }, p0/Z, [x26]\n"
-      "ld1rqb { z2.b }, p0/Z, [x25]\n"
-      "trn1 z0.d, z1.d, z2.d\n"
+      "ld1rqb { z0.b }, p0/Z, [x25]\n"
+      "trn1 z7.d, z1.d, z0.d\n"
       "ld1rqb { z3.b }, p0/Z, [x24]\n"
-      "ld1rqb { z4.b }, p0/Z, [x23]\n"
-      "trn2 z1.d, z1.d, z2.d\n"
-      "trn1 z2.d, z3.d, z4.d\n"
+      "ld1rqb { z2.b }, p0/Z, [x23]\n"
+      "trn2 z1.d, z1.d, z0.d\n"
+      "trn1 z6.d, z3.d, z2.d\n"
       "ld1rqb { z5.b }, p0/Z, [x22]\n"
-      "ld1rqb { z6.b }, p0/Z, [x21]\n"
-      "trn2 z3.d, z3.d, z4.d\n"
-      "trn1 z4.d, z5.d, z6.d\n"
-      "trn2 z5.d, z5.d, z6.d\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79808  // ummla z8.s, z0.b, z7.b\n"
-      ".inst 0x45c79850  // ummla z16.s, z2.b, z7.b\n"
-      ".inst 0x45c79898  // ummla z24.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x21]\n"
+      "trn2 z3.d, z3.d, z2.d\n"
+      "trn1 z4.d, z5.d, z0.d\n"
+      "trn2 z5.d, z5.d, z0.d\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45c298e8  // ummla z8.s, z7.b, z2.b\n"
+      ".inst 0x45c298d0  // ummla z16.s, z6.b, z2.b\n"
+      ".inst 0x45c29898  // ummla z24.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
       "subs x27, x27, #0x8\n"
-      ".inst 0x45c6980c  // ummla z12.s, z0.b, z6.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      ".inst 0x45c6989c  // ummla z28.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
-      ".inst 0x45c79851  // ummla z17.s, z2.b, z7.b\n"
-      ".inst 0x45c79899  // ummla z25.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45c6980d  // ummla z13.s, z0.b, z6.b\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      ".inst 0x45c6989d  // ummla z29.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7980a  // ummla z10.s, z0.b, z7.b\n"
-      ".inst 0x45c79852  // ummla z18.s, z2.b, z7.b\n"
-      ".inst 0x45c7989a  // ummla z26.s, z4.b, z7.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c6980e  // ummla z14.s, z0.b, z6.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      ".inst 0x45c6989e  // ummla z30.s, z4.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
+      ".inst 0x45c098ec  // ummla z12.s, z7.b, z0.b\n"
+      ".inst 0x45c098d4  // ummla z20.s, z6.b, z0.b\n"
+      ".inst 0x45c0989c  // ummla z28.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45c298e9  // ummla z9.s, z7.b, z2.b\n"
+      ".inst 0x45c298d1  // ummla z17.s, z6.b, z2.b\n"
+      ".inst 0x45c29899  // ummla z25.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45c098ed  // ummla z13.s, z7.b, z0.b\n"
+      ".inst 0x45c098d5  // ummla z21.s, z6.b, z0.b\n"
+      ".inst 0x45c0989d  // ummla z29.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45c298ea  // ummla z10.s, z7.b, z2.b\n"
+      ".inst 0x45c298d2  // ummla z18.s, z6.b, z2.b\n"
+      ".inst 0x45c2989a  // ummla z26.s, z4.b, z2.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45c098ee  // ummla z14.s, z7.b, z0.b\n"
+      ".inst 0x45c098d6  // ummla z22.s, z6.b, z0.b\n"
+      ".inst 0x45c0989e  // ummla z30.s, z4.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
+      ".inst 0x45c298eb  // ummla z11.s, z7.b, z2.b\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c79853  // ummla z19.s, z2.b, z7.b\n"
-      ".inst 0x45c7989b  // ummla z27.s, z4.b, z7.b\n"
-      ".inst 0x45c6980f  // ummla z15.s, z0.b, z6.b\n"
-      ".inst 0x45c69857  // ummla z23.s, z2.b, z6.b\n"
-      ".inst 0x45c6989f  // ummla z31.s, z4.b, z6.b\n"
+      ".inst 0x45c298d3  // ummla z19.s, z6.b, z2.b\n"
+      ".inst 0x45c2989b  // ummla z27.s, z4.b, z2.b\n"
+      ".inst 0x45c098ef  // ummla z15.s, z7.b, z0.b\n"
+      ".inst 0x45c098d7  // ummla z23.s, z6.b, z0.b\n"
+      ".inst 0x45c0989f  // ummla z31.s, z4.b, z0.b\n"
       "ble 65f\n"
-      "ld1b { z7.b }, p5/Z, [x10]\n"
-      "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
-      ".inst 0x45c79828  // ummla z8.s, z1.b, z7.b\n"
-      ".inst 0x45c79870  // ummla z16.s, z3.b, z7.b\n"
-      ".inst 0x45c798b8  // ummla z24.s, z5.b, z7.b\n"
-      ".inst 0x45c6982c  // ummla z12.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
-      ".inst 0x45c69874  // ummla z20.s, z3.b, z6.b\n"
-      ".inst 0x45c698bc  // ummla z28.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
-      ".inst 0x45c79829  // ummla z9.s, z1.b, z7.b\n"
-      ".inst 0x45c79871  // ummla z17.s, z3.b, z7.b\n"
-      ".inst 0x45c798b9  // ummla z25.s, z5.b, z7.b\n"
-      ".inst 0x45c6982d  // ummla z13.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n"
-      ".inst 0x45c69875  // ummla z21.s, z3.b, z6.b\n"
-      ".inst 0x45c698bd  // ummla z29.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n"
-      ".inst 0x45c7982a  // ummla z10.s, z1.b, z7.b\n"
-      ".inst 0x45c79872  // ummla z18.s, z3.b, z7.b\n"
-      ".inst 0x45c798ba  // ummla z26.s, z5.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n"
-      ".inst 0x45c69876  // ummla z22.s, z3.b, z6.b\n"
-      ".inst 0x45c698be  // ummla z30.s, z5.b, z6.b\n"
-      "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n"
+      "ld1b { z2.b }, p5/Z, [x10]\n"
+      "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
+      ".inst 0x45c29828  // ummla z8.s, z1.b, z2.b\n"
+      ".inst 0x45c29870  // ummla z16.s, z3.b, z2.b\n"
+      ".inst 0x45c298b8  // ummla z24.s, z5.b, z2.b\n"
+      ".inst 0x45c0982c  // ummla z12.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n"
+      ".inst 0x45c09874  // ummla z20.s, z3.b, z0.b\n"
+      ".inst 0x45c098bc  // ummla z28.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
+      ".inst 0x45c29829  // ummla z9.s, z1.b, z2.b\n"
+      ".inst 0x45c29871  // ummla z17.s, z3.b, z2.b\n"
+      ".inst 0x45c298b9  // ummla z25.s, z5.b, z2.b\n"
+      ".inst 0x45c0982d  // ummla z13.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n"
+      ".inst 0x45c09875  // ummla z21.s, z3.b, z0.b\n"
+      ".inst 0x45c098bd  // ummla z29.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
+      ".inst 0x45c2982a  // ummla z10.s, z1.b, z2.b\n"
+      ".inst 0x45c29872  // ummla z18.s, z3.b, z2.b\n"
+      ".inst 0x45c298ba  // ummla z26.s, z5.b, z2.b\n"
+      ".inst 0x45c0982e  // ummla z14.s, z1.b, z0.b\n"
+      "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n"
+      ".inst 0x45c09876  // ummla z22.s, z3.b, z0.b\n"
+      ".inst 0x45c098be  // ummla z30.s, z5.b, z0.b\n"
+      "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
       "addvl x10, x10, #8\n"
-      ".inst 0x45c7982b  // ummla z11.s, z1.b, z7.b\n"
-      ".inst 0x45c79873  // ummla z19.s, z3.b, z7.b\n"
-      ".inst 0x45c798bb  // ummla z27.s, z5.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c69877  // ummla z23.s, z3.b, z6.b\n"
-      ".inst 0x45c698bf  // ummla z31.s, z5.b, z6.b\n"
+      ".inst 0x45c2982b  // ummla z11.s, z1.b, z2.b\n"
+      ".inst 0x45c29873  // ummla z19.s, z3.b, z2.b\n"
+      ".inst 0x45c298bb  // ummla z27.s, z5.b, z2.b\n"
+      ".inst 0x45c0982f  // ummla z15.s, z1.b, z0.b\n"
+      ".inst 0x45c09877  // ummla z23.s, z3.b, z0.b\n"
+      ".inst 0x45c098bf  // ummla z31.s, z5.b, z0.b\n"
       "65:"  // Height 6: Multiply loop: multiply skip
       "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
       "add x28, x28, #0x1\n"
@@ -1596,7 +1596,7 @@
       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
       "add x24, x9, x20, LSL #2\n"
       "add x23, x24, x20, LSL #2\n"
-      "uzp1 z7.d, z8.d, z12.d\n"
+      "uzp1 z0.d, z8.d, z12.d\n"
       "add x22, x23, x20, LSL #2\n"
       "add x21, x22, x20, LSL #2\n"
       "uzp2 z8.d, z8.d, z12.d\n"
@@ -1604,7 +1604,7 @@
       "add x20, x21, x20, LSL #2\n"
       "uzp2 z9.d, z9.d, z13.d\n"
       "uzp1 z13.d, z10.d, z14.d\n"
-      "st1w { z7.s }, p4, [x9]\n"
+      "st1w { z0.s }, p4, [x9]\n"
       "uzp2 z10.d, z10.d, z14.d\n"
       "uzp1 z14.d, z11.d, z15.d\n"
       "st1w { z12.s }, p3, [x9, #1, MUL VL]\n"
@@ -1664,7 +1664,6 @@
       "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
       "b 1b\n"
       "68:"  // Exit
-
       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
       : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1672,4 +1671,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif  // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index f5fdf99..1ae035c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@
         return get_vector_length<float>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<float>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 2;
@@ -97,5 +92,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 9445292..e507bc5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_bf16fp32_dot_8x3VL(
-    const bfloat16 *Apanel, const bfloat16 *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const bfloat16 *Apanel,
+    const bfloat16 *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -85,10 +89,10 @@
       "3:"  // main loop head
       ".inst 0x64604088  // bfdot z8.s, z4.h, z0.h[0]\n"
       ".inst 0x6468408b  // bfdot z11.s, z4.h, z0.h[1]\n"
-      "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n"
       ".inst 0x6470408e  // bfdot z14.s, z4.h, z0.h[2]\n"
       ".inst 0x64784091  // bfdot z17.s, z4.h, z0.h[3]\n"
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n"
       ".inst 0x64614094  // bfdot z20.s, z4.h, z1.h[0]\n"
       ".inst 0x64694097  // bfdot z23.s, z4.h, z1.h[1]\n"
       "sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@
       ".inst 0x646940d9  // bfdot z25.s, z6.h, z1.h[1]\n"
       ".inst 0x647140dc  // bfdot z28.s, z6.h, z1.h[2]\n"
       ".inst 0x647940df  // bfdot z31.s, z6.h, z1.h[3]\n"
-      "ld1h { z6.h }, p0/Z, [x22, #5, MUL VL]\n"
+      "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
       "addvl x22, x22, #6\n"
-      ".inst 0x64624088  // bfdot z8.s, z4.h, z2.h[0]\n"
-      ".inst 0x646a408b  // bfdot z11.s, z4.h, z2.h[1]\n"
+      ".inst 0x64634088  // bfdot z8.s, z4.h, z3.h[0]\n"
+      ".inst 0x646b408b  // bfdot z11.s, z4.h, z3.h[1]\n"
       "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
-      ".inst 0x6472408e  // bfdot z14.s, z4.h, z2.h[2]\n"
-      ".inst 0x647a4091  // bfdot z17.s, z4.h, z2.h[3]\n"
-      ".inst 0x64634094  // bfdot z20.s, z4.h, z3.h[0]\n"
-      ".inst 0x646b4097  // bfdot z23.s, z4.h, z3.h[1]\n"
-      ".inst 0x6473409a  // bfdot z26.s, z4.h, z3.h[2]\n"
-      ".inst 0x647b409d  // bfdot z29.s, z4.h, z3.h[3]\n"
+      ".inst 0x6473408e  // bfdot z14.s, z4.h, z3.h[2]\n"
+      ".inst 0x647b4091  // bfdot z17.s, z4.h, z3.h[3]\n"
+      ".inst 0x64674094  // bfdot z20.s, z4.h, z7.h[0]\n"
+      ".inst 0x646f4097  // bfdot z23.s, z4.h, z7.h[1]\n"
+      ".inst 0x6477409a  // bfdot z26.s, z4.h, z7.h[2]\n"
+      ".inst 0x647f409d  // bfdot z29.s, z4.h, z7.h[3]\n"
       "ld1h { z4.h }, p0/Z, [x22]\n"
-      ".inst 0x646240a9  // bfdot z9.s, z5.h, z2.h[0]\n"
-      ".inst 0x646a40ac  // bfdot z12.s, z5.h, z2.h[1]\n"
-      ".inst 0x647240af  // bfdot z15.s, z5.h, z2.h[2]\n"
-      ".inst 0x647a40b2  // bfdot z18.s, z5.h, z2.h[3]\n"
-      ".inst 0x646340b5  // bfdot z21.s, z5.h, z3.h[0]\n"
-      ".inst 0x646b40b8  // bfdot z24.s, z5.h, z3.h[1]\n"
-      ".inst 0x647340bb  // bfdot z27.s, z5.h, z3.h[2]\n"
-      ".inst 0x647b40be  // bfdot z30.s, z5.h, z3.h[3]\n"
+      ".inst 0x646340a9  // bfdot z9.s, z5.h, z3.h[0]\n"
+      ".inst 0x646b40ac  // bfdot z12.s, z5.h, z3.h[1]\n"
+      ".inst 0x647340af  // bfdot z15.s, z5.h, z3.h[2]\n"
+      ".inst 0x647b40b2  // bfdot z18.s, z5.h, z3.h[3]\n"
+      ".inst 0x646740b5  // bfdot z21.s, z5.h, z7.h[0]\n"
+      ".inst 0x646f40b8  // bfdot z24.s, z5.h, z7.h[1]\n"
+      ".inst 0x647740bb  // bfdot z27.s, z5.h, z7.h[2]\n"
+      ".inst 0x647f40be  // bfdot z30.s, z5.h, z7.h[3]\n"
       "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x646240ca  // bfdot z10.s, z6.h, z2.h[0]\n"
-      ".inst 0x646a40cd  // bfdot z13.s, z6.h, z2.h[1]\n"
-      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
-      ".inst 0x647a40d3  // bfdot z19.s, z6.h, z2.h[3]\n"
-      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
-      ".inst 0x646b40d9  // bfdot z25.s, z6.h, z3.h[1]\n"
-      ".inst 0x647340dc  // bfdot z28.s, z6.h, z3.h[2]\n"
-      ".inst 0x647b40df  // bfdot z31.s, z6.h, z3.h[3]\n"
+      ".inst 0x6463404a  // bfdot z10.s, z2.h, z3.h[0]\n"
+      ".inst 0x646b404d  // bfdot z13.s, z2.h, z3.h[1]\n"
+      ".inst 0x64734050  // bfdot z16.s, z2.h, z3.h[2]\n"
+      ".inst 0x647b4053  // bfdot z19.s, z2.h, z3.h[3]\n"
+      ".inst 0x64674056  // bfdot z22.s, z2.h, z7.h[0]\n"
+      ".inst 0x646f4059  // bfdot z25.s, z2.h, z7.h[1]\n"
+      ".inst 0x6477405c  // bfdot z28.s, z2.h, z7.h[2]\n"
+      ".inst 0x647f405f  // bfdot z31.s, z2.h, z7.h[3]\n"
       "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n"
       "bge 3b\n"
       "4:"  // main loop skip
@@ -174,37 +178,37 @@
       ".inst 0x647140dc  // bfdot z28.s, z6.h, z1.h[2]\n"
       ".inst 0x647940df  // bfdot z31.s, z6.h, z1.h[3]\n"
       "cbz x20, 5f\n"
-      "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ld1h { z7.h }, p0/Z, [x22]\n"
-      "ld1h { z4.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x646040e8  // bfdot z8.s, z7.h, z0.h[0]\n"
-      "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
-      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
-      ".inst 0x647040ee  // bfdot z14.s, z7.h, z0.h[2]\n"
-      ".inst 0x647840f1  // bfdot z17.s, z7.h, z0.h[3]\n"
-      ".inst 0x646140f4  // bfdot z20.s, z7.h, z1.h[0]\n"
+      "ld1h { z2.h }, p0/Z, [x22]\n"
+      "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x64644048  // bfdot z8.s, z2.h, z4.h[0]\n"
+      "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+      ".inst 0x646c404b  // bfdot z11.s, z2.h, z4.h[1]\n"
+      ".inst 0x6474404e  // bfdot z14.s, z2.h, z4.h[2]\n"
+      ".inst 0x647c4051  // bfdot z17.s, z2.h, z4.h[3]\n"
+      ".inst 0x64634054  // bfdot z20.s, z2.h, z3.h[0]\n"
       "addvl x22, x22, #3\n"
-      ".inst 0x646940f7  // bfdot z23.s, z7.h, z1.h[1]\n"
-      ".inst 0x647140fa  // bfdot z26.s, z7.h, z1.h[2]\n"
-      ".inst 0x647940fd  // bfdot z29.s, z7.h, z1.h[3]\n"
-      ".inst 0x64604089  // bfdot z9.s, z4.h, z0.h[0]\n"
-      ".inst 0x6468408c  // bfdot z12.s, z4.h, z0.h[1]\n"
-      ".inst 0x6470408f  // bfdot z15.s, z4.h, z0.h[2]\n"
-      ".inst 0x64784092  // bfdot z18.s, z4.h, z0.h[3]\n"
-      ".inst 0x64614095  // bfdot z21.s, z4.h, z1.h[0]\n"
-      ".inst 0x64694098  // bfdot z24.s, z4.h, z1.h[1]\n"
-      ".inst 0x6471409b  // bfdot z27.s, z4.h, z1.h[2]\n"
-      ".inst 0x6479409e  // bfdot z30.s, z4.h, z1.h[3]\n"
-      ".inst 0x646040aa  // bfdot z10.s, z5.h, z0.h[0]\n"
-      ".inst 0x646840ad  // bfdot z13.s, z5.h, z0.h[1]\n"
-      ".inst 0x647040b0  // bfdot z16.s, z5.h, z0.h[2]\n"
-      ".inst 0x647840b3  // bfdot z19.s, z5.h, z0.h[3]\n"
-      ".inst 0x646140b6  // bfdot z22.s, z5.h, z1.h[0]\n"
-      ".inst 0x646940b9  // bfdot z25.s, z5.h, z1.h[1]\n"
-      ".inst 0x647140bc  // bfdot z28.s, z5.h, z1.h[2]\n"
-      ".inst 0x647940bf  // bfdot z31.s, z5.h, z1.h[3]\n"
+      ".inst 0x646b4057  // bfdot z23.s, z2.h, z3.h[1]\n"
+      ".inst 0x6473405a  // bfdot z26.s, z2.h, z3.h[2]\n"
+      ".inst 0x647b405d  // bfdot z29.s, z2.h, z3.h[3]\n"
+      ".inst 0x64644029  // bfdot z9.s, z1.h, z4.h[0]\n"
+      ".inst 0x646c402c  // bfdot z12.s, z1.h, z4.h[1]\n"
+      ".inst 0x6474402f  // bfdot z15.s, z1.h, z4.h[2]\n"
+      ".inst 0x647c4032  // bfdot z18.s, z1.h, z4.h[3]\n"
+      ".inst 0x64634035  // bfdot z21.s, z1.h, z3.h[0]\n"
+      ".inst 0x646b4038  // bfdot z24.s, z1.h, z3.h[1]\n"
+      ".inst 0x6473403b  // bfdot z27.s, z1.h, z3.h[2]\n"
+      ".inst 0x647b403e  // bfdot z30.s, z1.h, z3.h[3]\n"
+      ".inst 0x6464400a  // bfdot z10.s, z0.h, z4.h[0]\n"
+      ".inst 0x646c400d  // bfdot z13.s, z0.h, z4.h[1]\n"
+      ".inst 0x64744010  // bfdot z16.s, z0.h, z4.h[2]\n"
+      ".inst 0x647c4013  // bfdot z19.s, z0.h, z4.h[3]\n"
+      ".inst 0x64634016  // bfdot z22.s, z0.h, z3.h[0]\n"
+      ".inst 0x646b4019  // bfdot z25.s, z0.h, z3.h[1]\n"
+      ".inst 0x6473401c  // bfdot z28.s, z0.h, z3.h[2]\n"
+      ".inst 0x647b401f  // bfdot z31.s, z0.h, z3.h[3]\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
@@ -243,4 +247,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index 1de8c68..c5096ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../bfloat.hpp"
 #include "../performance_parameters.hpp"
@@ -56,11 +56,6 @@
         return get_vector_length<float>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<float>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 4;
@@ -109,5 +104,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index fe5382d..ba71857 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_bf16fp32_mmla_8x3VL(
-    const bfloat16 *Apanel, const bfloat16 *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const bfloat16 *Apanel,
+    const bfloat16 *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -85,82 +89,82 @@
       "mov z31.b, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
       ".inst 0x6465e40b  // bfmmla z11.s, z0.h, z5.h\n"
       ".inst 0x6464e42e  // bfmmla z14.s, z1.h, z4.h\n"
       ".inst 0x6465e431  // bfmmla z17.s, z1.h, z5.h\n"
-      "ld1h { z6.h }, p0/Z, [x22]\n"
+      "ld1h { z7.h }, p0/Z, [x22]\n"
       ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
       ".inst 0x6465e457  // bfmmla z23.s, z2.h, z5.h\n"
-      "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x6464e47a  // bfmmla z26.s, z3.h, z4.h\n"
-      ".inst 0x6465e47d  // bfmmla z29.s, z3.h, z5.h\n"
-      "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
-      ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6467e432  // bfmmla z18.s, z1.h, z7.h\n"
+      "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x6464e4da  // bfmmla z26.s, z6.h, z4.h\n"
+      ".inst 0x6465e4dd  // bfmmla z29.s, z6.h, z5.h\n"
+      "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x6467e409  // bfmmla z9.s, z0.h, z7.h\n"
+      ".inst 0x6463e40c  // bfmmla z12.s, z0.h, z3.h\n"
+      ".inst 0x6467e42f  // bfmmla z15.s, z1.h, z7.h\n"
+      ".inst 0x6463e432  // bfmmla z18.s, z1.h, z3.h\n"
       "sub x20, x20, #0x2\n"
-      ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e458  // bfmmla z24.s, z2.h, z7.h\n"
+      ".inst 0x6467e455  // bfmmla z21.s, z2.h, z7.h\n"
+      ".inst 0x6463e458  // bfmmla z24.s, z2.h, z3.h\n"
       "cmp x20, #0x2\n"
-      ".inst 0x6466e47b  // bfmmla z27.s, z3.h, z6.h\n"
-      ".inst 0x6467e47e  // bfmmla z30.s, z3.h, z7.h\n"
-      "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e40d  // bfmmla z13.s, z0.h, z5.h\n"
+      ".inst 0x6467e4db  // bfmmla z27.s, z6.h, z7.h\n"
+      ".inst 0x6463e4de  // bfmmla z30.s, z6.h, z3.h\n"
+      "ld1h { z3.h }, p0/Z, [x22, #4, MUL VL]\n"
+      ".inst 0x6465e40a  // bfmmla z10.s, z0.h, z5.h\n"
+      ".inst 0x6464e40d  // bfmmla z13.s, z0.h, z4.h\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
-      ".inst 0x6464e430  // bfmmla z16.s, z1.h, z4.h\n"
-      ".inst 0x6465e433  // bfmmla z19.s, z1.h, z5.h\n"
+      ".inst 0x6465e430  // bfmmla z16.s, z1.h, z5.h\n"
+      ".inst 0x6464e433  // bfmmla z19.s, z1.h, z4.h\n"
       "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e459  // bfmmla z25.s, z2.h, z5.h\n"
+      ".inst 0x6465e456  // bfmmla z22.s, z2.h, z5.h\n"
+      ".inst 0x6464e459  // bfmmla z25.s, z2.h, z4.h\n"
       "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
-      ".inst 0x6464e47c  // bfmmla z28.s, z3.h, z4.h\n"
-      ".inst 0x6465e47f  // bfmmla z31.s, z3.h, z5.h\n"
-      "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n"
-      "ld1h { z4.h }, p0/Z, [x22, #6, MUL VL]\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "ld1h { z5.h }, p0/Z, [x22, #7, MUL VL]\n"
+      ".inst 0x6465e4dc  // bfmmla z28.s, z6.h, z5.h\n"
+      ".inst 0x6464e4df  // bfmmla z31.s, z6.h, z4.h\n"
+      "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n"
+      "ld1h { z2.h }, p0/Z, [x22, #6, MUL VL]\n"
+      ".inst 0x6463e408  // bfmmla z8.s, z0.h, z3.h\n"
+      "ld1h { z4.h }, p0/Z, [x22, #7, MUL VL]\n"
       "addvl x22, x22, #16\n"
       ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
+      ".inst 0x6463e42e  // bfmmla z14.s, z1.h, z3.h\n"
       ".inst 0x6467e431  // bfmmla z17.s, z1.h, z7.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6466e47a  // bfmmla z26.s, z3.h, z6.h\n"
-      ".inst 0x6467e47d  // bfmmla z29.s, z3.h, z7.h\n"
-      "ld1h { z6.h }, p0/Z, [x22, #-8, MUL VL]\n"
+      ".inst 0x6463e4b4  // bfmmla z20.s, z5.h, z3.h\n"
+      ".inst 0x6467e4b7  // bfmmla z23.s, z5.h, z7.h\n"
+      ".inst 0x6463e4da  // bfmmla z26.s, z6.h, z3.h\n"
+      ".inst 0x6467e4dd  // bfmmla z29.s, z6.h, z7.h\n"
+      "ld1h { z3.h }, p0/Z, [x22, #-8, MUL VL]\n"
       "ld1h { z7.h }, p0/Z, [x22, #-7, MUL VL]\n"
-      ".inst 0x6464e409  // bfmmla z9.s, z0.h, z4.h\n"
-      ".inst 0x6465e40c  // bfmmla z12.s, z0.h, z5.h\n"
-      ".inst 0x6464e42f  // bfmmla z15.s, z1.h, z4.h\n"
-      ".inst 0x6465e432  // bfmmla z18.s, z1.h, z5.h\n"
-      ".inst 0x6464e455  // bfmmla z21.s, z2.h, z4.h\n"
-      ".inst 0x6465e458  // bfmmla z24.s, z2.h, z5.h\n"
-      ".inst 0x6464e47b  // bfmmla z27.s, z3.h, z4.h\n"
-      ".inst 0x6465e47e  // bfmmla z30.s, z3.h, z5.h\n"
+      ".inst 0x6462e409  // bfmmla z9.s, z0.h, z2.h\n"
+      ".inst 0x6464e40c  // bfmmla z12.s, z0.h, z4.h\n"
+      ".inst 0x6462e42f  // bfmmla z15.s, z1.h, z2.h\n"
+      ".inst 0x6464e432  // bfmmla z18.s, z1.h, z4.h\n"
+      ".inst 0x6462e4b5  // bfmmla z21.s, z5.h, z2.h\n"
+      ".inst 0x6464e4b8  // bfmmla z24.s, z5.h, z4.h\n"
+      ".inst 0x6462e4db  // bfmmla z27.s, z6.h, z2.h\n"
+      ".inst 0x6464e4de  // bfmmla z30.s, z6.h, z4.h\n"
       "ld1h { z4.h }, p0/Z, [x22, #-6, MUL VL]\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
+      ".inst 0x6463e40a  // bfmmla z10.s, z0.h, z3.h\n"
       ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
-      ".inst 0x6466e430  // bfmmla z16.s, z1.h, z6.h\n"
+      ".inst 0x6463e430  // bfmmla z16.s, z1.h, z3.h\n"
       ".inst 0x6467e433  // bfmmla z19.s, z1.h, z7.h\n"
       "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6467e459  // bfmmla z25.s, z2.h, z7.h\n"
+      ".inst 0x6463e4b6  // bfmmla z22.s, z5.h, z3.h\n"
+      ".inst 0x6467e4b9  // bfmmla z25.s, z5.h, z7.h\n"
       "ld1h { z5.h }, p0/Z, [x22, #-5, MUL VL]\n"
-      ".inst 0x6466e47c  // bfmmla z28.s, z3.h, z6.h\n"
-      ".inst 0x6467e47f  // bfmmla z31.s, z3.h, z7.h\n"
+      ".inst 0x6463e4dc  // bfmmla z28.s, z6.h, z3.h\n"
+      ".inst 0x6467e4df  // bfmmla z31.s, z6.h, z7.h\n"
       "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "addvl x22, x22, #-4\n"
       "bge 3b\n"
       "4:"  // main loop skip
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x6464e408  // bfmmla z8.s, z0.h, z4.h\n"
       ".inst 0x6465e40b  // bfmmla z11.s, z0.h, z5.h\n"
       ".inst 0x6464e42e  // bfmmla z14.s, z1.h, z4.h\n"
@@ -168,114 +172,114 @@
       "ld1h { z6.h }, p0/Z, [x22]\n"
       ".inst 0x6464e454  // bfmmla z20.s, z2.h, z4.h\n"
       ".inst 0x6465e457  // bfmmla z23.s, z2.h, z5.h\n"
-      "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x6464e47a  // bfmmla z26.s, z3.h, z4.h\n"
-      ".inst 0x6465e47d  // bfmmla z29.s, z3.h, z5.h\n"
-      "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
-      "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x6464e4fa  // bfmmla z26.s, z7.h, z4.h\n"
+      ".inst 0x6465e4fd  // bfmmla z29.s, z7.h, z5.h\n"
+      "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n"
       ".inst 0x6466e409  // bfmmla z9.s, z0.h, z6.h\n"
-      ".inst 0x6467e40c  // bfmmla z12.s, z0.h, z7.h\n"
+      ".inst 0x6463e40c  // bfmmla z12.s, z0.h, z3.h\n"
       ".inst 0x6466e42f  // bfmmla z15.s, z1.h, z6.h\n"
-      ".inst 0x6467e432  // bfmmla z18.s, z1.h, z7.h\n"
+      ".inst 0x6463e432  // bfmmla z18.s, z1.h, z3.h\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
       ".inst 0x6466e455  // bfmmla z21.s, z2.h, z6.h\n"
-      ".inst 0x6467e458  // bfmmla z24.s, z2.h, z7.h\n"
+      ".inst 0x6463e458  // bfmmla z24.s, z2.h, z3.h\n"
       "addvl x22, x22, #4\n"
-      ".inst 0x6466e47b  // bfmmla z27.s, z3.h, z6.h\n"
-      ".inst 0x6467e47e  // bfmmla z30.s, z3.h, z7.h\n"
-      ".inst 0x6464e40a  // bfmmla z10.s, z0.h, z4.h\n"
-      ".inst 0x6465e40d  // bfmmla z13.s, z0.h, z5.h\n"
-      ".inst 0x6464e430  // bfmmla z16.s, z1.h, z4.h\n"
-      ".inst 0x6465e433  // bfmmla z19.s, z1.h, z5.h\n"
-      ".inst 0x6464e456  // bfmmla z22.s, z2.h, z4.h\n"
-      ".inst 0x6465e459  // bfmmla z25.s, z2.h, z5.h\n"
-      ".inst 0x6464e47c  // bfmmla z28.s, z3.h, z4.h\n"
-      ".inst 0x6465e47f  // bfmmla z31.s, z3.h, z5.h\n"
+      ".inst 0x6466e4fb  // bfmmla z27.s, z7.h, z6.h\n"
+      ".inst 0x6463e4fe  // bfmmla z30.s, z7.h, z3.h\n"
+      ".inst 0x6465e40a  // bfmmla z10.s, z0.h, z5.h\n"
+      ".inst 0x6464e40d  // bfmmla z13.s, z0.h, z4.h\n"
+      ".inst 0x6465e430  // bfmmla z16.s, z1.h, z5.h\n"
+      ".inst 0x6464e433  // bfmmla z19.s, z1.h, z4.h\n"
+      ".inst 0x6465e456  // bfmmla z22.s, z2.h, z5.h\n"
+      ".inst 0x6464e459  // bfmmla z25.s, z2.h, z4.h\n"
+      ".inst 0x6465e4fc  // bfmmla z28.s, z7.h, z5.h\n"
+      ".inst 0x6464e4ff  // bfmmla z31.s, z7.h, z4.h\n"
       "cbz x20, 5f\n"
-      "ld1h { z6.h }, p0/Z, [x22]\n"
-      "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      ".inst 0x6466e408  // bfmmla z8.s, z0.h, z6.h\n"
-      "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
-      "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x6467e40b  // bfmmla z11.s, z0.h, z7.h\n"
-      "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
-      "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
-      ".inst 0x6466e42e  // bfmmla z14.s, z1.h, z6.h\n"
-      ".inst 0x6467e431  // bfmmla z17.s, z1.h, z7.h\n"
-      ".inst 0x6466e454  // bfmmla z20.s, z2.h, z6.h\n"
-      "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
-      ".inst 0x6467e457  // bfmmla z23.s, z2.h, z7.h\n"
-      ".inst 0x6466e47a  // bfmmla z26.s, z3.h, z6.h\n"
-      "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x6467e47d  // bfmmla z29.s, z3.h, z7.h\n"
-      "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
-      ".inst 0x6464e409  // bfmmla z9.s, z0.h, z4.h\n"
-      ".inst 0x6465e40c  // bfmmla z12.s, z0.h, z5.h\n"
+      "ld1h { z1.h }, p0/Z, [x22]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n"
+      ".inst 0x6461e4e8  // bfmmla z8.s, z7.h, z1.h\n"
+      "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1h { z0.h }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x6460e4eb  // bfmmla z11.s, z7.h, z0.h\n"
+      "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n"
+      ".inst 0x6461e4ce  // bfmmla z14.s, z6.h, z1.h\n"
+      ".inst 0x6460e4d1  // bfmmla z17.s, z6.h, z0.h\n"
+      ".inst 0x6461e4b4  // bfmmla z20.s, z5.h, z1.h\n"
+      "ld1h { z3.h }, p0/Z, [x22, #2, MUL VL]\n"
+      ".inst 0x6460e4b7  // bfmmla z23.s, z5.h, z0.h\n"
+      ".inst 0x6461e49a  // bfmmla z26.s, z4.h, z1.h\n"
+      "ld1h { z2.h }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x6460e49d  // bfmmla z29.s, z4.h, z0.h\n"
+      "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1h { z0.h }, p0/Z, [x22, #5, MUL VL]\n"
+      ".inst 0x6463e4e9  // bfmmla z9.s, z7.h, z3.h\n"
+      ".inst 0x6462e4ec  // bfmmla z12.s, z7.h, z2.h\n"
       "addvl x22, x22, #6\n"
-      ".inst 0x6464e42f  // bfmmla z15.s, z1.h, z4.h\n"
-      ".inst 0x6465e432  // bfmmla z18.s, z1.h, z5.h\n"
+      ".inst 0x6463e4cf  // bfmmla z15.s, z6.h, z3.h\n"
+      ".inst 0x6462e4d2  // bfmmla z18.s, z6.h, z2.h\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x6464e455  // bfmmla z21.s, z2.h, z4.h\n"
-      ".inst 0x6465e458  // bfmmla z24.s, z2.h, z5.h\n"
-      ".inst 0x6464e47b  // bfmmla z27.s, z3.h, z4.h\n"
-      ".inst 0x6465e47e  // bfmmla z30.s, z3.h, z5.h\n"
-      ".inst 0x6466e40a  // bfmmla z10.s, z0.h, z6.h\n"
-      ".inst 0x6467e40d  // bfmmla z13.s, z0.h, z7.h\n"
-      ".inst 0x6466e430  // bfmmla z16.s, z1.h, z6.h\n"
-      ".inst 0x6467e433  // bfmmla z19.s, z1.h, z7.h\n"
-      ".inst 0x6466e456  // bfmmla z22.s, z2.h, z6.h\n"
-      ".inst 0x6467e459  // bfmmla z25.s, z2.h, z7.h\n"
-      ".inst 0x6466e47c  // bfmmla z28.s, z3.h, z6.h\n"
-      ".inst 0x6467e47f  // bfmmla z31.s, z3.h, z7.h\n"
+      ".inst 0x6463e4b5  // bfmmla z21.s, z5.h, z3.h\n"
+      ".inst 0x6462e4b8  // bfmmla z24.s, z5.h, z2.h\n"
+      ".inst 0x6463e49b  // bfmmla z27.s, z4.h, z3.h\n"
+      ".inst 0x6462e49e  // bfmmla z30.s, z4.h, z2.h\n"
+      ".inst 0x6461e4ea  // bfmmla z10.s, z7.h, z1.h\n"
+      ".inst 0x6460e4ed  // bfmmla z13.s, z7.h, z0.h\n"
+      ".inst 0x6461e4d0  // bfmmla z16.s, z6.h, z1.h\n"
+      ".inst 0x6460e4d3  // bfmmla z19.s, z6.h, z0.h\n"
+      ".inst 0x6461e4b6  // bfmmla z22.s, z5.h, z1.h\n"
+      ".inst 0x6460e4b9  // bfmmla z25.s, z5.h, z0.h\n"
+      ".inst 0x6461e49c  // bfmmla z28.s, z4.h, z1.h\n"
+      ".inst 0x6460e49f  // bfmmla z31.s, z4.h, z0.h\n"
       "5:"  // multiply loop done
-      "uzp1 z4.d, z8.d, z11.d\n"
+      "uzp1 z0.d, z8.d, z11.d\n"
       "uzp2 z8.d, z8.d, z11.d\n"
-      "st1w { z4.s }, p0, [%x[Cpanel]]\n"
-      "uzp1 z11.d, z9.d, z12.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+      "uzp1 z0.d, z9.d, z12.d\n"
       "uzp2 z9.d, z9.d, z12.d\n"
-      "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "uzp1 z12.d, z10.d, z13.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+      "uzp1 z0.d, z10.d, z13.d\n"
       "uzp2 z10.d, z10.d, z13.d\n"
-      "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "uzp1 z13.d, z14.d, z17.d\n"
+      "uzp1 z0.d, z14.d, z17.d\n"
       "uzp2 z14.d, z14.d, z17.d\n"
       "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
-      "uzp1 z17.d, z15.d, z18.d\n"
+      "uzp1 z1.d, z15.d, z18.d\n"
       "subs x23, x23, #0x1\n"
       "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "uzp2 z15.d, z15.d, z18.d\n"
-      "uzp1 z18.d, z16.d, z19.d\n"
-      "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+      "uzp1 z17.d, z16.d, z19.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "uzp2 z16.d, z16.d, z19.d\n"
-      "uzp1 z19.d, z20.d, z23.d\n"
-      "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+      "uzp1 z0.d, z20.d, z23.d\n"
+      "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
       "addvl %x[Cpanel], %x[Cpanel], #16\n"
       "uzp2 z20.d, z20.d, z23.d\n"
-      "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
       "uzp1 z23.d, z21.d, z24.d\n"
       "uzp2 z21.d, z21.d, z24.d\n"
       "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
-      "uzp1 z24.d, z22.d, z25.d\n"
+      "uzp1 z19.d, z22.d, z25.d\n"
       "uzp2 z22.d, z22.d, z25.d\n"
       "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
-      "uzp1 z25.d, z26.d, z29.d\n"
+      "uzp1 z18.d, z26.d, z29.d\n"
       "uzp2 z26.d, z26.d, z29.d\n"
       "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
-      "uzp1 z29.d, z27.d, z30.d\n"
+      "uzp1 z17.d, z27.d, z30.d\n"
       "uzp2 z27.d, z27.d, z30.d\n"
-      "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
-      "uzp1 z30.d, z28.d, z31.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+      "uzp1 z16.d, z28.d, z31.d\n"
       "uzp2 z28.d, z28.d, z31.d\n"
       "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
-      "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+      "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
       "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
       "st1w { z21.s }, p0, [%x[Cpanel]]\n"
       "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
-      "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+      "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+      "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
       "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
@@ -290,4 +294,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index 6f1089d..6c54167 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -56,11 +56,6 @@
         return get_vector_length<__fp16>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<__fp16>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 1;
@@ -81,6 +76,8 @@
                     return { 13.84, 2.07, 2.52 };
                 case CPUModel::V1:
                     return { 31.90, 5.15, 10.34 };
+                case CPUModel::A64FX:
+                    return { 44.34, 3.23, 7.06 };
             }
         }
 
@@ -104,5 +101,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
index 9287509..609277d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -28,8 +28,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_fp16_mla_8x3VL_a64fx(
-    const __fp16 *Apanel, const __fp16 *Bpanel,
-    __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+    const __fp16 *Apanel,
+    const __fp16 *Bpanel,
+    __fp16 *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -88,7 +92,7 @@
       "fmla z9.h, p0/M, z1.h, z3.h\n"
       "sub x20, x20, #0x2\n"
       "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+      "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
       "fmla z11.h, p0/M, z0.h, z4.h\n"
       "fmla z12.h, p0/M, z1.h, z4.h\n"
       "fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -97,63 +101,63 @@
       "fmla z15.h, p0/M, z1.h, z5.h\n"
       "cmp x20, #0x2\n"
       "fmla z16.h, p0/M, z2.h, z5.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #12]\n"
       "fmla z17.h, p0/M, z0.h, z6.h\n"
       "fmla z18.h, p0/M, z1.h, z6.h\n"
       "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
-      "fmla z22.h, p0/M, z2.h, z3.h\n"
-      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #14]\n"
+      "fmla z20.h, p0/M, z0.h, z7.h\n"
+      "fmla z21.h, p0/M, z1.h, z7.h\n"
+      "fmla z22.h, p0/M, z2.h, z7.h\n"
+      "ld1rh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
       "fmla z23.h, p0/M, z0.h, z4.h\n"
       "fmla z24.h, p0/M, z1.h, z4.h\n"
       "fmla z25.h, p0/M, z2.h, z4.h\n"
       "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
-      "fmla z26.h, p0/M, z0.h, z5.h\n"
-      "fmla z27.h, p0/M, z1.h, z5.h\n"
-      "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "ld1h { z0.h }, p0/Z, [x22, #3, MUL VL]\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
-      "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n"
-      "fmla z8.h, p0/M, z0.h, z3.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n"
-      "fmla z9.h, p0/M, z1.h, z3.h\n"
-      "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "fmla z11.h, p0/M, z0.h, z4.h\n"
-      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
-      "fmla z12.h, p0/M, z1.h, z4.h\n"
-      "fmla z13.h, p0/M, z2.h, z4.h\n"
+      "fmla z26.h, p0/M, z0.h, z3.h\n"
+      "fmla z27.h, p0/M, z1.h, z3.h\n"
+      "fmla z28.h, p0/M, z2.h, z3.h\n"
+      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #20]\n"
+      "fmla z29.h, p0/M, z0.h, z5.h\n"
+      "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "fmla z30.h, p0/M, z1.h, z5.h\n"
+      "fmla z31.h, p0/M, z2.h, z5.h\n"
+      "ld1h { z2.h }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1h { z5.h }, p0/Z, [x22, #5, MUL VL]\n"
+      "fmla z8.h, p0/M, z6.h, z7.h\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n"
+      "fmla z9.h, p0/M, z2.h, z7.h\n"
+      "fmla z10.h, p0/M, z5.h, z7.h\n"
+      "fmla z11.h, p0/M, z6.h, z4.h\n"
+      "ld1rh { z7.h }, p0/Z, [%x[Apanel], #24]\n"
+      "fmla z12.h, p0/M, z2.h, z4.h\n"
+      "fmla z13.h, p0/M, z5.h, z4.h\n"
       "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
-      "fmla z14.h, p0/M, z0.h, z5.h\n"
-      "fmla z15.h, p0/M, z1.h, z5.h\n"
+      "fmla z14.h, p0/M, z6.h, z3.h\n"
+      "fmla z15.h, p0/M, z2.h, z3.h\n"
       "addvl x22, x22, #6\n"
-      "fmla z16.h, p0/M, z2.h, z5.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z17.h, p0/M, z0.h, z6.h\n"
-      "fmla z18.h, p0/M, z1.h, z6.h\n"
-      "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n"
+      "fmla z16.h, p0/M, z5.h, z3.h\n"
+      "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z17.h, p0/M, z6.h, z1.h\n"
+      "fmla z18.h, p0/M, z2.h, z1.h\n"
+      "fmla z19.h, p0/M, z5.h, z1.h\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
-      "fmla z22.h, p0/M, z2.h, z3.h\n"
-      "fmla z23.h, p0/M, z0.h, z4.h\n"
+      "fmla z20.h, p0/M, z6.h, z7.h\n"
+      "fmla z21.h, p0/M, z2.h, z7.h\n"
+      "fmla z22.h, p0/M, z5.h, z7.h\n"
+      "fmla z23.h, p0/M, z6.h, z4.h\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
-      "fmla z24.h, p0/M, z1.h, z4.h\n"
-      "fmla z25.h, p0/M, z2.h, z4.h\n"
+      "fmla z24.h, p0/M, z2.h, z4.h\n"
+      "fmla z25.h, p0/M, z5.h, z4.h\n"
       "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
-      "fmla z26.h, p0/M, z0.h, z5.h\n"
-      "fmla z27.h, p0/M, z1.h, z5.h\n"
-      "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
+      "fmla z26.h, p0/M, z6.h, z0.h\n"
+      "fmla z27.h, p0/M, z2.h, z0.h\n"
+      "fmla z28.h, p0/M, z5.h, z0.h\n"
+      "fmla z29.h, p0/M, z6.h, z1.h\n"
       "ld1h { z0.h }, p0/Z, [x22]\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
+      "fmla z30.h, p0/M, z2.h, z1.h\n"
+      "fmla z31.h, p0/M, z5.h, z1.h\n"
       "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
       "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
@@ -164,7 +168,7 @@
       "fmla z9.h, p0/M, z1.h, z3.h\n"
       "addvl x22, x22, #3\n"
       "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+      "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n"
       "fmla z11.h, p0/M, z0.h, z4.h\n"
       "fmla z12.h, p0/M, z1.h, z4.h\n"
       "fmla z13.h, p0/M, z2.h, z4.h\n"
@@ -176,58 +180,58 @@
       "fmla z17.h, p0/M, z0.h, z6.h\n"
       "fmla z18.h, p0/M, z1.h, z6.h\n"
       "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
+      "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n"
+      "fmla z20.h, p0/M, z0.h, z7.h\n"
+      "fmla z21.h, p0/M, z1.h, z7.h\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
-      "fmla z22.h, p0/M, z2.h, z3.h\n"
+      "fmla z22.h, p0/M, z2.h, z7.h\n"
       "fmla z23.h, p0/M, z0.h, z4.h\n"
       "fmla z24.h, p0/M, z1.h, z4.h\n"
       "fmla z25.h, p0/M, z2.h, z4.h\n"
       "fmla z26.h, p0/M, z0.h, z5.h\n"
       "fmla z27.h, p0/M, z1.h, z5.h\n"
       "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
+      "fmla z29.h, p0/M, z0.h, z3.h\n"
+      "fmla z30.h, p0/M, z1.h, z3.h\n"
+      "fmla z31.h, p0/M, z2.h, z3.h\n"
       "cbz x20, 5f\n"
-      "ld1h { z0.h }, p0/Z, [x22]\n"
-      "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1h { z6.h }, p0/Z, [x22]\n"
+      "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
-      "fmla z8.h, p0/M, z0.h, z3.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
-      "fmla z9.h, p0/M, z1.h, z3.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
-      "fmla z10.h, p0/M, z2.h, z3.h\n"
-      "fmla z11.h, p0/M, z0.h, z4.h\n"
-      "fmla z12.h, p0/M, z1.h, z4.h\n"
-      "fmla z13.h, p0/M, z2.h, z4.h\n"
+      "fmla z8.h, p0/M, z6.h, z3.h\n"
+      "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n"
+      "fmla z9.h, p0/M, z5.h, z3.h\n"
+      "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n"
+      "fmla z10.h, p0/M, z4.h, z3.h\n"
+      "fmla z11.h, p0/M, z6.h, z2.h\n"
+      "fmla z12.h, p0/M, z5.h, z2.h\n"
+      "fmla z13.h, p0/M, z4.h, z2.h\n"
       "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
-      "fmla z14.h, p0/M, z0.h, z5.h\n"
-      "fmla z15.h, p0/M, z1.h, z5.h\n"
-      "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
-      "fmla z16.h, p0/M, z2.h, z5.h\n"
-      "fmla z17.h, p0/M, z0.h, z6.h\n"
-      "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
-      "fmla z18.h, p0/M, z1.h, z6.h\n"
-      "fmla z19.h, p0/M, z2.h, z6.h\n"
-      "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
-      "fmla z20.h, p0/M, z0.h, z3.h\n"
-      "fmla z21.h, p0/M, z1.h, z3.h\n"
+      "fmla z14.h, p0/M, z6.h, z1.h\n"
+      "fmla z15.h, p0/M, z5.h, z1.h\n"
+      "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n"
+      "fmla z16.h, p0/M, z4.h, z1.h\n"
+      "fmla z17.h, p0/M, z6.h, z0.h\n"
+      "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n"
+      "fmla z18.h, p0/M, z5.h, z0.h\n"
+      "fmla z19.h, p0/M, z4.h, z0.h\n"
+      "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n"
+      "fmla z20.h, p0/M, z6.h, z3.h\n"
+      "fmla z21.h, p0/M, z5.h, z3.h\n"
       "addvl x22, x22, #3\n"
-      "fmla z22.h, p0/M, z2.h, z3.h\n"
-      "fmla z23.h, p0/M, z0.h, z4.h\n"
+      "fmla z22.h, p0/M, z4.h, z3.h\n"
+      "fmla z23.h, p0/M, z6.h, z2.h\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
-      "fmla z24.h, p0/M, z1.h, z4.h\n"
-      "fmla z25.h, p0/M, z2.h, z4.h\n"
-      "fmla z26.h, p0/M, z0.h, z5.h\n"
-      "fmla z27.h, p0/M, z1.h, z5.h\n"
-      "fmla z28.h, p0/M, z2.h, z5.h\n"
-      "fmla z29.h, p0/M, z0.h, z6.h\n"
-      "fmla z30.h, p0/M, z1.h, z6.h\n"
-      "fmla z31.h, p0/M, z2.h, z6.h\n"
+      "fmla z24.h, p0/M, z5.h, z2.h\n"
+      "fmla z25.h, p0/M, z4.h, z2.h\n"
+      "fmla z26.h, p0/M, z6.h, z1.h\n"
+      "fmla z27.h, p0/M, z5.h, z1.h\n"
+      "fmla z28.h, p0/M, z4.h, z1.h\n"
+      "fmla z29.h, p0/M, z6.h, z0.h\n"
+      "fmla z30.h, p0/M, z5.h, z0.h\n"
+      "fmla z31.h, p0/M, z4.h, z0.h\n"
       "5:"  // multiply loop done
       "st1h { z8.h }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
@@ -261,7 +265,7 @@
       "bne 1b\n"
       : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
       : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
-      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index 1ac2ac0..3b16c97 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -28,8 +28,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_fp16_mla_8x3VL(
-    const __fp16 *Apanel, const __fp16 *Bpanel,
-    __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+    const __fp16 *Apanel,
+    const __fp16 *Bpanel,
+    __fp16 *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -83,16 +87,16 @@
       "3:"  // main loop head
       "fmla z8.h, z2.h, z0.h[0]\n"
       "fmla z11.h, z2.h, z0.h[1]\n"
-      "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n"
       "fmla z14.h, z2.h, z0.h[2]\n"
       "fmla z17.h, z2.h, z0.h[3]\n"
-      "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n"
       "fmla z20.h, z2.h, z0.h[4]\n"
       "fmla z23.h, z2.h, z0.h[5]\n"
-      "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1h { z5.h }, p0/Z, [x22, #4, MUL VL]\n"
       "fmla z26.h, z2.h, z0.h[6]\n"
       "fmla z29.h, z2.h, z0.h[7]\n"
-      "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n"
+      "ld1h { z1.h }, p0/Z, [x22, #5, MUL VL]\n"
       "fmla z9.h, z3.h, z0.h[0]\n"
       "fmla z12.h, z3.h, z0.h[1]\n"
       "addvl x22, x22, #6\n"
@@ -116,31 +120,31 @@
       "fmla z28.h, z4.h, z0.h[6]\n"
       "fmla z31.h, z4.h, z0.h[7]\n"
       "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      "fmla z8.h, z5.h, z1.h[0]\n"
-      "fmla z11.h, z5.h, z1.h[1]\n"
+      "fmla z8.h, z6.h, z7.h[0]\n"
+      "fmla z11.h, z6.h, z7.h[1]\n"
       "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n"
-      "fmla z14.h, z5.h, z1.h[2]\n"
-      "fmla z17.h, z5.h, z1.h[3]\n"
-      "fmla z20.h, z5.h, z1.h[4]\n"
-      "fmla z23.h, z5.h, z1.h[5]\n"
-      "fmla z26.h, z5.h, z1.h[6]\n"
-      "fmla z29.h, z5.h, z1.h[7]\n"
-      "fmla z9.h, z6.h, z1.h[0]\n"
-      "fmla z12.h, z6.h, z1.h[1]\n"
-      "fmla z15.h, z6.h, z1.h[2]\n"
-      "fmla z18.h, z6.h, z1.h[3]\n"
-      "fmla z21.h, z6.h, z1.h[4]\n"
-      "fmla z24.h, z6.h, z1.h[5]\n"
-      "fmla z27.h, z6.h, z1.h[6]\n"
-      "fmla z30.h, z6.h, z1.h[7]\n"
-      "fmla z10.h, z7.h, z1.h[0]\n"
-      "fmla z13.h, z7.h, z1.h[1]\n"
-      "fmla z16.h, z7.h, z1.h[2]\n"
-      "fmla z19.h, z7.h, z1.h[3]\n"
-      "fmla z22.h, z7.h, z1.h[4]\n"
-      "fmla z25.h, z7.h, z1.h[5]\n"
-      "fmla z28.h, z7.h, z1.h[6]\n"
-      "fmla z31.h, z7.h, z1.h[7]\n"
+      "fmla z14.h, z6.h, z7.h[2]\n"
+      "fmla z17.h, z6.h, z7.h[3]\n"
+      "fmla z20.h, z6.h, z7.h[4]\n"
+      "fmla z23.h, z6.h, z7.h[5]\n"
+      "fmla z26.h, z6.h, z7.h[6]\n"
+      "fmla z29.h, z6.h, z7.h[7]\n"
+      "fmla z9.h, z5.h, z7.h[0]\n"
+      "fmla z12.h, z5.h, z7.h[1]\n"
+      "fmla z15.h, z5.h, z7.h[2]\n"
+      "fmla z18.h, z5.h, z7.h[3]\n"
+      "fmla z21.h, z5.h, z7.h[4]\n"
+      "fmla z24.h, z5.h, z7.h[5]\n"
+      "fmla z27.h, z5.h, z7.h[6]\n"
+      "fmla z30.h, z5.h, z7.h[7]\n"
+      "fmla z10.h, z1.h, z7.h[0]\n"
+      "fmla z13.h, z1.h, z7.h[1]\n"
+      "fmla z16.h, z1.h, z7.h[2]\n"
+      "fmla z19.h, z1.h, z7.h[3]\n"
+      "fmla z22.h, z1.h, z7.h[4]\n"
+      "fmla z25.h, z1.h, z7.h[5]\n"
+      "fmla z28.h, z1.h, z7.h[6]\n"
+      "fmla z31.h, z1.h, z7.h[7]\n"
       "bge 3b\n"
       "4:"  // main loop skip
       "fmla z8.h, z2.h, z0.h[0]\n"
@@ -170,36 +174,36 @@
       "fmla z28.h, z4.h, z0.h[6]\n"
       "fmla z31.h, z4.h, z0.h[7]\n"
       "cbz x20, 5f\n"
-      "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
-      "ld1h { z5.h }, p0/Z, [x22]\n"
-      "fmla z8.h, z5.h, z0.h[0]\n"
-      "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n"
-      "ld1h { z7.h }, p0/Z, [x22, #2, MUL VL]\n"
-      "fmla z11.h, z5.h, z0.h[1]\n"
-      "fmla z14.h, z5.h, z0.h[2]\n"
-      "fmla z17.h, z5.h, z0.h[3]\n"
+      "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+      "ld1h { z2.h }, p0/Z, [x22]\n"
+      "fmla z8.h, z2.h, z3.h[0]\n"
+      "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n"
+      "fmla z11.h, z2.h, z3.h[1]\n"
+      "fmla z14.h, z2.h, z3.h[2]\n"
+      "fmla z17.h, z2.h, z3.h[3]\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
-      "fmla z20.h, z5.h, z0.h[4]\n"
-      "fmla z23.h, z5.h, z0.h[5]\n"
+      "fmla z20.h, z2.h, z3.h[4]\n"
+      "fmla z23.h, z2.h, z3.h[5]\n"
       "addvl x22, x22, #3\n"
-      "fmla z26.h, z5.h, z0.h[6]\n"
-      "fmla z29.h, z5.h, z0.h[7]\n"
-      "fmla z9.h, z6.h, z0.h[0]\n"
-      "fmla z12.h, z6.h, z0.h[1]\n"
-      "fmla z15.h, z6.h, z0.h[2]\n"
-      "fmla z18.h, z6.h, z0.h[3]\n"
-      "fmla z21.h, z6.h, z0.h[4]\n"
-      "fmla z24.h, z6.h, z0.h[5]\n"
-      "fmla z27.h, z6.h, z0.h[6]\n"
-      "fmla z30.h, z6.h, z0.h[7]\n"
-      "fmla z10.h, z7.h, z0.h[0]\n"
-      "fmla z13.h, z7.h, z0.h[1]\n"
-      "fmla z16.h, z7.h, z0.h[2]\n"
-      "fmla z19.h, z7.h, z0.h[3]\n"
-      "fmla z22.h, z7.h, z0.h[4]\n"
-      "fmla z25.h, z7.h, z0.h[5]\n"
-      "fmla z28.h, z7.h, z0.h[6]\n"
-      "fmla z31.h, z7.h, z0.h[7]\n"
+      "fmla z26.h, z2.h, z3.h[6]\n"
+      "fmla z29.h, z2.h, z3.h[7]\n"
+      "fmla z9.h, z1.h, z3.h[0]\n"
+      "fmla z12.h, z1.h, z3.h[1]\n"
+      "fmla z15.h, z1.h, z3.h[2]\n"
+      "fmla z18.h, z1.h, z3.h[3]\n"
+      "fmla z21.h, z1.h, z3.h[4]\n"
+      "fmla z24.h, z1.h, z3.h[5]\n"
+      "fmla z27.h, z1.h, z3.h[6]\n"
+      "fmla z30.h, z1.h, z3.h[7]\n"
+      "fmla z10.h, z0.h, z3.h[0]\n"
+      "fmla z13.h, z0.h, z3.h[1]\n"
+      "fmla z16.h, z0.h, z3.h[2]\n"
+      "fmla z19.h, z0.h, z3.h[3]\n"
+      "fmla z22.h, z0.h, z3.h[4]\n"
+      "fmla z25.h, z0.h, z3.h[5]\n"
+      "fmla z28.h, z0.h, z3.h[6]\n"
+      "fmla z31.h, z0.h, z3.h[7]\n"
       "5:"  // multiply loop done
       "st1h { z8.h }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index 29b928e..23ab7ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -56,11 +56,6 @@
         return get_vector_length<float>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<float>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 1;
@@ -75,10 +70,14 @@
 
         if (std::is_same<T, float>::value) {
             switch (ci->get_cpu_model()) {
-                case CPUModel::V1:
-                    return { 15.15, 9.24, 6.42 };
                 default:
                     return { 7.2307, 3.876, 2.932 };
+                case CPUModel::A64FX:
+                    return { 26.52, 3.42, 4.59 };
+                case CPUModel::A510:
+                    return { 6.25, 3.84, 2.47 };
+                case CPUModel::V1:
+                    return { 15.15, 9.24, 6.42 };
             }
         }
 
@@ -102,5 +101,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
index 3141a25..0b13913 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -28,8 +28,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_fp32_mla_8x3VL_a64fx(
-    const float *Apanel, const float *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const float *Apanel,
+    const float *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -88,7 +92,7 @@
       "fmla z9.s, p0/M, z1.s, z3.s\n"
       "sub x20, x20, #0x2\n"
       "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "fmla z11.s, p0/M, z0.s, z4.s\n"
       "fmla z12.s, p0/M, z1.s, z4.s\n"
       "fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -97,63 +101,63 @@
       "fmla z15.s, p0/M, z1.s, z5.s\n"
       "cmp x20, #0x2\n"
       "fmla z16.s, p0/M, z2.s, z5.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
       "fmla z17.s, p0/M, z0.s, z6.s\n"
       "fmla z18.s, p0/M, z1.s, z6.s\n"
       "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
-      "fmla z22.s, p0/M, z2.s, z3.s\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z20.s, p0/M, z0.s, z7.s\n"
+      "fmla z21.s, p0/M, z1.s, z7.s\n"
+      "fmla z22.s, p0/M, z2.s, z7.s\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
       "fmla z23.s, p0/M, z0.s, z4.s\n"
       "fmla z24.s, p0/M, z1.s, z4.s\n"
       "fmla z25.s, p0/M, z2.s, z4.s\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
-      "fmla z26.s, p0/M, z0.s, z5.s\n"
-      "fmla z27.s, p0/M, z1.s, z5.s\n"
-      "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "ld1w { z0.s }, p0/Z, [x22, #3, MUL VL]\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
-      "ld1w { z1.s }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
-      "fmla z8.s, p0/M, z0.s, z3.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
-      "fmla z9.s, p0/M, z1.s, z3.s\n"
-      "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "fmla z11.s, p0/M, z0.s, z4.s\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
-      "fmla z12.s, p0/M, z1.s, z4.s\n"
-      "fmla z13.s, p0/M, z2.s, z4.s\n"
+      "fmla z26.s, p0/M, z0.s, z3.s\n"
+      "fmla z27.s, p0/M, z1.s, z3.s\n"
+      "fmla z28.s, p0/M, z2.s, z3.s\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+      "fmla z29.s, p0/M, z0.s, z5.s\n"
+      "ld1w { z6.s }, p0/Z, [x22, #3, MUL VL]\n"
+      "fmla z30.s, p0/M, z1.s, z5.s\n"
+      "fmla z31.s, p0/M, z2.s, z5.s\n"
+      "ld1w { z2.s }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z5.s }, p0/Z, [x22, #5, MUL VL]\n"
+      "fmla z8.s, p0/M, z6.s, z7.s\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+      "fmla z9.s, p0/M, z2.s, z7.s\n"
+      "fmla z10.s, p0/M, z5.s, z7.s\n"
+      "fmla z11.s, p0/M, z6.s, z4.s\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+      "fmla z12.s, p0/M, z2.s, z4.s\n"
+      "fmla z13.s, p0/M, z5.s, z4.s\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
-      "fmla z14.s, p0/M, z0.s, z5.s\n"
-      "fmla z15.s, p0/M, z1.s, z5.s\n"
+      "fmla z14.s, p0/M, z6.s, z3.s\n"
+      "fmla z15.s, p0/M, z2.s, z3.s\n"
       "addvl x22, x22, #6\n"
-      "fmla z16.s, p0/M, z2.s, z5.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
-      "fmla z17.s, p0/M, z0.s, z6.s\n"
-      "fmla z18.s, p0/M, z1.s, z6.s\n"
-      "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+      "fmla z16.s, p0/M, z5.s, z3.s\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+      "fmla z17.s, p0/M, z6.s, z1.s\n"
+      "fmla z18.s, p0/M, z2.s, z1.s\n"
+      "fmla z19.s, p0/M, z5.s, z1.s\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
-      "fmla z22.s, p0/M, z2.s, z3.s\n"
-      "fmla z23.s, p0/M, z0.s, z4.s\n"
+      "fmla z20.s, p0/M, z6.s, z7.s\n"
+      "fmla z21.s, p0/M, z2.s, z7.s\n"
+      "fmla z22.s, p0/M, z5.s, z7.s\n"
+      "fmla z23.s, p0/M, z6.s, z4.s\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "fmla z24.s, p0/M, z1.s, z4.s\n"
-      "fmla z25.s, p0/M, z2.s, z4.s\n"
+      "fmla z24.s, p0/M, z2.s, z4.s\n"
+      "fmla z25.s, p0/M, z5.s, z4.s\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "fmla z26.s, p0/M, z0.s, z5.s\n"
-      "fmla z27.s, p0/M, z1.s, z5.s\n"
-      "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
+      "fmla z26.s, p0/M, z6.s, z0.s\n"
+      "fmla z27.s, p0/M, z2.s, z0.s\n"
+      "fmla z28.s, p0/M, z5.s, z0.s\n"
+      "fmla z29.s, p0/M, z6.s, z1.s\n"
       "ld1w { z0.s }, p0/Z, [x22]\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
+      "fmla z30.s, p0/M, z2.s, z1.s\n"
+      "fmla z31.s, p0/M, z5.s, z1.s\n"
       "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
       "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -164,7 +168,7 @@
       "fmla z9.s, p0/M, z1.s, z3.s\n"
       "addvl x22, x22, #3\n"
       "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "fmla z11.s, p0/M, z0.s, z4.s\n"
       "fmla z12.s, p0/M, z1.s, z4.s\n"
       "fmla z13.s, p0/M, z2.s, z4.s\n"
@@ -176,58 +180,58 @@
       "fmla z17.s, p0/M, z0.s, z6.s\n"
       "fmla z18.s, p0/M, z1.s, z6.s\n"
       "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z20.s, p0/M, z0.s, z7.s\n"
+      "fmla z21.s, p0/M, z1.s, z7.s\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "fmla z22.s, p0/M, z2.s, z3.s\n"
+      "fmla z22.s, p0/M, z2.s, z7.s\n"
       "fmla z23.s, p0/M, z0.s, z4.s\n"
       "fmla z24.s, p0/M, z1.s, z4.s\n"
       "fmla z25.s, p0/M, z2.s, z4.s\n"
       "fmla z26.s, p0/M, z0.s, z5.s\n"
       "fmla z27.s, p0/M, z1.s, z5.s\n"
       "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
+      "fmla z29.s, p0/M, z0.s, z3.s\n"
+      "fmla z30.s, p0/M, z1.s, z3.s\n"
+      "fmla z31.s, p0/M, z2.s, z3.s\n"
       "cbz x20, 5f\n"
-      "ld1w { z0.s }, p0/Z, [x22]\n"
-      "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
-      "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z6.s }, p0/Z, [x22]\n"
+      "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z4.s }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "fmla z8.s, p0/M, z0.s, z3.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
-      "fmla z9.s, p0/M, z1.s, z3.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
-      "fmla z10.s, p0/M, z2.s, z3.s\n"
-      "fmla z11.s, p0/M, z0.s, z4.s\n"
-      "fmla z12.s, p0/M, z1.s, z4.s\n"
-      "fmla z13.s, p0/M, z2.s, z4.s\n"
+      "fmla z8.s, p0/M, z6.s, z3.s\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+      "fmla z9.s, p0/M, z5.s, z3.s\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+      "fmla z10.s, p0/M, z4.s, z3.s\n"
+      "fmla z11.s, p0/M, z6.s, z2.s\n"
+      "fmla z12.s, p0/M, z5.s, z2.s\n"
+      "fmla z13.s, p0/M, z4.s, z2.s\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
-      "fmla z14.s, p0/M, z0.s, z5.s\n"
-      "fmla z15.s, p0/M, z1.s, z5.s\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
-      "fmla z16.s, p0/M, z2.s, z5.s\n"
-      "fmla z17.s, p0/M, z0.s, z6.s\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
-      "fmla z18.s, p0/M, z1.s, z6.s\n"
-      "fmla z19.s, p0/M, z2.s, z6.s\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "fmla z20.s, p0/M, z0.s, z3.s\n"
-      "fmla z21.s, p0/M, z1.s, z3.s\n"
+      "fmla z14.s, p0/M, z6.s, z1.s\n"
+      "fmla z15.s, p0/M, z5.s, z1.s\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+      "fmla z16.s, p0/M, z4.s, z1.s\n"
+      "fmla z17.s, p0/M, z6.s, z0.s\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+      "fmla z18.s, p0/M, z5.s, z0.s\n"
+      "fmla z19.s, p0/M, z4.s, z0.s\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+      "fmla z20.s, p0/M, z6.s, z3.s\n"
+      "fmla z21.s, p0/M, z5.s, z3.s\n"
       "addvl x22, x22, #3\n"
-      "fmla z22.s, p0/M, z2.s, z3.s\n"
-      "fmla z23.s, p0/M, z0.s, z4.s\n"
+      "fmla z22.s, p0/M, z4.s, z3.s\n"
+      "fmla z23.s, p0/M, z6.s, z2.s\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "fmla z24.s, p0/M, z1.s, z4.s\n"
-      "fmla z25.s, p0/M, z2.s, z4.s\n"
-      "fmla z26.s, p0/M, z0.s, z5.s\n"
-      "fmla z27.s, p0/M, z1.s, z5.s\n"
-      "fmla z28.s, p0/M, z2.s, z5.s\n"
-      "fmla z29.s, p0/M, z0.s, z6.s\n"
-      "fmla z30.s, p0/M, z1.s, z6.s\n"
-      "fmla z31.s, p0/M, z2.s, z6.s\n"
+      "fmla z24.s, p0/M, z5.s, z2.s\n"
+      "fmla z25.s, p0/M, z4.s, z2.s\n"
+      "fmla z26.s, p0/M, z6.s, z1.s\n"
+      "fmla z27.s, p0/M, z5.s, z1.s\n"
+      "fmla z28.s, p0/M, z4.s, z1.s\n"
+      "fmla z29.s, p0/M, z6.s, z0.s\n"
+      "fmla z30.s, p0/M, z5.s, z0.s\n"
+      "fmla z31.s, p0/M, z4.s, z0.s\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
@@ -261,7 +265,7 @@
       "bne 1b\n"
       : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
       : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
-      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index 9d1c0c3..c7f32ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -28,8 +28,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_fp32_mla_8x3VL(
-    const float *Apanel, const float *Bpanel,
-    float *Cpanel, int ablocks, int bblocks, int K) {
+    const float *Apanel,
+    const float *Bpanel,
+    float *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -84,10 +88,10 @@
       "3:"  // main loop head
       "fmla z8.s, z4.s, z0.s[0]\n"
       "fmla z11.s, z4.s, z0.s[1]\n"
-      "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
       "fmla z14.s, z4.s, z0.s[2]\n"
       "fmla z17.s, z4.s, z0.s[3]\n"
-      "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
       "fmla z20.s, z4.s, z1.s[0]\n"
       "fmla z23.s, z4.s, z1.s[1]\n"
       "sub x20, x20, #0x2\n"
@@ -114,35 +118,35 @@
       "fmla z25.s, z6.s, z1.s[1]\n"
       "fmla z28.s, z6.s, z1.s[2]\n"
       "fmla z31.s, z6.s, z1.s[3]\n"
-      "ld1w { z6.s }, p0/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
       "addvl x22, x22, #6\n"
-      "fmla z8.s, z4.s, z2.s[0]\n"
-      "fmla z11.s, z4.s, z2.s[1]\n"
+      "fmla z8.s, z4.s, z3.s[0]\n"
+      "fmla z11.s, z4.s, z3.s[1]\n"
       "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
-      "fmla z14.s, z4.s, z2.s[2]\n"
-      "fmla z17.s, z4.s, z2.s[3]\n"
-      "fmla z20.s, z4.s, z3.s[0]\n"
-      "fmla z23.s, z4.s, z3.s[1]\n"
-      "fmla z26.s, z4.s, z3.s[2]\n"
-      "fmla z29.s, z4.s, z3.s[3]\n"
+      "fmla z14.s, z4.s, z3.s[2]\n"
+      "fmla z17.s, z4.s, z3.s[3]\n"
+      "fmla z20.s, z4.s, z7.s[0]\n"
+      "fmla z23.s, z4.s, z7.s[1]\n"
+      "fmla z26.s, z4.s, z7.s[2]\n"
+      "fmla z29.s, z4.s, z7.s[3]\n"
       "ld1w { z4.s }, p0/Z, [x22]\n"
-      "fmla z9.s, z5.s, z2.s[0]\n"
-      "fmla z12.s, z5.s, z2.s[1]\n"
-      "fmla z15.s, z5.s, z2.s[2]\n"
-      "fmla z18.s, z5.s, z2.s[3]\n"
-      "fmla z21.s, z5.s, z3.s[0]\n"
-      "fmla z24.s, z5.s, z3.s[1]\n"
-      "fmla z27.s, z5.s, z3.s[2]\n"
-      "fmla z30.s, z5.s, z3.s[3]\n"
+      "fmla z9.s, z5.s, z3.s[0]\n"
+      "fmla z12.s, z5.s, z3.s[1]\n"
+      "fmla z15.s, z5.s, z3.s[2]\n"
+      "fmla z18.s, z5.s, z3.s[3]\n"
+      "fmla z21.s, z5.s, z7.s[0]\n"
+      "fmla z24.s, z5.s, z7.s[1]\n"
+      "fmla z27.s, z5.s, z7.s[2]\n"
+      "fmla z30.s, z5.s, z7.s[3]\n"
       "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n"
-      "fmla z10.s, z6.s, z2.s[0]\n"
-      "fmla z13.s, z6.s, z2.s[1]\n"
-      "fmla z16.s, z6.s, z2.s[2]\n"
-      "fmla z19.s, z6.s, z2.s[3]\n"
-      "fmla z22.s, z6.s, z3.s[0]\n"
-      "fmla z25.s, z6.s, z3.s[1]\n"
-      "fmla z28.s, z6.s, z3.s[2]\n"
-      "fmla z31.s, z6.s, z3.s[3]\n"
+      "fmla z10.s, z2.s, z3.s[0]\n"
+      "fmla z13.s, z2.s, z3.s[1]\n"
+      "fmla z16.s, z2.s, z3.s[2]\n"
+      "fmla z19.s, z2.s, z3.s[3]\n"
+      "fmla z22.s, z2.s, z7.s[0]\n"
+      "fmla z25.s, z2.s, z7.s[1]\n"
+      "fmla z28.s, z2.s, z7.s[2]\n"
+      "fmla z31.s, z2.s, z7.s[3]\n"
       "ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n"
       "bge 3b\n"
       "4:"  // main loop skip
@@ -173,37 +177,37 @@
       "fmla z28.s, z6.s, z1.s[2]\n"
       "fmla z31.s, z6.s, z1.s[3]\n"
       "cbz x20, 5f\n"
-      "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
-      "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ld1w { z7.s }, p0/Z, [x22]\n"
-      "ld1w { z4.s }, p0/Z, [x22, #1, MUL VL]\n"
-      "fmla z8.s, z7.s, z0.s[0]\n"
-      "ld1w { z5.s }, p0/Z, [x22, #2, MUL VL]\n"
-      "fmla z11.s, z7.s, z0.s[1]\n"
-      "fmla z14.s, z7.s, z0.s[2]\n"
-      "fmla z17.s, z7.s, z0.s[3]\n"
-      "fmla z20.s, z7.s, z1.s[0]\n"
+      "ld1w { z2.s }, p0/Z, [x22]\n"
+      "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n"
+      "fmla z8.s, z2.s, z4.s[0]\n"
+      "ld1w { z0.s }, p0/Z, [x22, #2, MUL VL]\n"
+      "fmla z11.s, z2.s, z4.s[1]\n"
+      "fmla z14.s, z2.s, z4.s[2]\n"
+      "fmla z17.s, z2.s, z4.s[3]\n"
+      "fmla z20.s, z2.s, z3.s[0]\n"
       "addvl x22, x22, #3\n"
-      "fmla z23.s, z7.s, z1.s[1]\n"
-      "fmla z26.s, z7.s, z1.s[2]\n"
-      "fmla z29.s, z7.s, z1.s[3]\n"
-      "fmla z9.s, z4.s, z0.s[0]\n"
-      "fmla z12.s, z4.s, z0.s[1]\n"
-      "fmla z15.s, z4.s, z0.s[2]\n"
-      "fmla z18.s, z4.s, z0.s[3]\n"
-      "fmla z21.s, z4.s, z1.s[0]\n"
-      "fmla z24.s, z4.s, z1.s[1]\n"
-      "fmla z27.s, z4.s, z1.s[2]\n"
-      "fmla z30.s, z4.s, z1.s[3]\n"
-      "fmla z10.s, z5.s, z0.s[0]\n"
-      "fmla z13.s, z5.s, z0.s[1]\n"
-      "fmla z16.s, z5.s, z0.s[2]\n"
-      "fmla z19.s, z5.s, z0.s[3]\n"
-      "fmla z22.s, z5.s, z1.s[0]\n"
-      "fmla z25.s, z5.s, z1.s[1]\n"
-      "fmla z28.s, z5.s, z1.s[2]\n"
-      "fmla z31.s, z5.s, z1.s[3]\n"
+      "fmla z23.s, z2.s, z3.s[1]\n"
+      "fmla z26.s, z2.s, z3.s[2]\n"
+      "fmla z29.s, z2.s, z3.s[3]\n"
+      "fmla z9.s, z1.s, z4.s[0]\n"
+      "fmla z12.s, z1.s, z4.s[1]\n"
+      "fmla z15.s, z1.s, z4.s[2]\n"
+      "fmla z18.s, z1.s, z4.s[3]\n"
+      "fmla z21.s, z1.s, z3.s[0]\n"
+      "fmla z24.s, z1.s, z3.s[1]\n"
+      "fmla z27.s, z1.s, z3.s[2]\n"
+      "fmla z30.s, z1.s, z3.s[3]\n"
+      "fmla z10.s, z0.s, z4.s[0]\n"
+      "fmla z13.s, z0.s, z4.s[1]\n"
+      "fmla z16.s, z0.s, z4.s[2]\n"
+      "fmla z19.s, z0.s, z4.s[3]\n"
+      "fmla z22.s, z0.s, z3.s[0]\n"
+      "fmla z25.s, z0.s, z3.s[1]\n"
+      "fmla z28.s, z0.s, z3.s[2]\n"
+      "fmla z31.s, z0.s, z3.s[3]\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 0d707b0..cf3069f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -35,6 +35,7 @@
 {
 // Actual kernel implementations
 void sve_interleaved_s8s32_dot_8x3VL( ARGLIST );
+void sve_interleaved_s8s32_dot_8x3VL_a64fx( ARGLIST );
 
 class cls_sve_interleaved_s8s32_dot_8x3VL
 {
@@ -55,11 +56,6 @@
         return get_vector_length<int32_t>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<int32_t>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 4;
@@ -80,6 +76,8 @@
                     return { 63.30, 4.97, 11.35 };
                 case CPUModel::A510:
                     return { 27.42, 3.47, 2.88 };
+                case CPUModel::A64FX:
+                    return { 109.18, 3.88, 7.85 };
             }
         }
 
@@ -92,6 +90,8 @@
                     return { 52.24, 7.49, 0.80 };
                 case CPUModel::A510:
                     return { 27.47, 1.70, 0.28 };
+                case CPUModel::A64FX:
+                    return { 109.92, 2.36, 0.41 };
             }
         }
 
@@ -100,13 +100,19 @@
 
     // Default to the generic kernel
     kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
-    cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
+    cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A64FX:
+                kernel=sve_interleaved_s8s32_dot_8x3VL_a64fx;
+                break;
+        }
     }
 };
 
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
index a7ca48d..c668a7b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_s8s32_dot_8x3VL_a64fx(
-    const int8_t *Apanel, const int8_t *Bpanel,
-    int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *Apanel,
+    const int8_t *Bpanel,
+    int32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -89,7 +93,7 @@
       "sdot z9.s, z1.b, z3.b\n"
       "sub x20, x20, #0x2\n"
       "sdot z10.s, z2.b, z3.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "sdot z11.s, z0.b, z4.b\n"
       "sdot z12.s, z1.b, z4.b\n"
       "sdot z13.s, z2.b, z4.b\n"
@@ -98,63 +102,63 @@
       "sdot z15.s, z1.b, z5.b\n"
       "cmp x20, #0x2\n"
       "sdot z16.s, z2.b, z5.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
       "sdot z17.s, z0.b, z6.b\n"
       "sdot z18.s, z1.b, z6.b\n"
       "sdot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "sdot z20.s, z0.b, z3.b\n"
-      "sdot z21.s, z1.b, z3.b\n"
-      "sdot z22.s, z2.b, z3.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+      "sdot z20.s, z0.b, z7.b\n"
+      "sdot z21.s, z1.b, z7.b\n"
+      "sdot z22.s, z2.b, z7.b\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
       "sdot z23.s, z0.b, z4.b\n"
       "sdot z24.s, z1.b, z4.b\n"
       "sdot z25.s, z2.b, z4.b\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
-      "sdot z26.s, z0.b, z5.b\n"
-      "sdot z27.s, z1.b, z5.b\n"
-      "sdot z28.s, z2.b, z5.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
-      "sdot z29.s, z0.b, z6.b\n"
-      "ld1b { z0.b }, p0/Z, [x22, #3, MUL VL]\n"
-      "sdot z30.s, z1.b, z6.b\n"
-      "sdot z31.s, z2.b, z6.b\n"
-      "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
-      "sdot z8.s, z0.b, z3.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
-      "sdot z9.s, z1.b, z3.b\n"
-      "sdot z10.s, z2.b, z3.b\n"
-      "sdot z11.s, z0.b, z4.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
-      "sdot z12.s, z1.b, z4.b\n"
-      "sdot z13.s, z2.b, z4.b\n"
+      "sdot z26.s, z0.b, z3.b\n"
+      "sdot z27.s, z1.b, z3.b\n"
+      "sdot z28.s, z2.b, z3.b\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+      "sdot z29.s, z0.b, z5.b\n"
+      "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+      "sdot z30.s, z1.b, z5.b\n"
+      "sdot z31.s, z2.b, z5.b\n"
+      "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z7.b\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+      "sdot z9.s, z2.b, z7.b\n"
+      "sdot z10.s, z5.b, z7.b\n"
+      "sdot z11.s, z6.b, z4.b\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+      "sdot z12.s, z2.b, z4.b\n"
+      "sdot z13.s, z5.b, z4.b\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
-      "sdot z14.s, z0.b, z5.b\n"
-      "sdot z15.s, z1.b, z5.b\n"
+      "sdot z14.s, z6.b, z3.b\n"
+      "sdot z15.s, z2.b, z3.b\n"
       "addvl x22, x22, #6\n"
-      "sdot z16.s, z2.b, z5.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
-      "sdot z17.s, z0.b, z6.b\n"
-      "sdot z18.s, z1.b, z6.b\n"
-      "sdot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+      "sdot z16.s, z5.b, z3.b\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+      "sdot z17.s, z6.b, z1.b\n"
+      "sdot z18.s, z2.b, z1.b\n"
+      "sdot z19.s, z5.b, z1.b\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      "sdot z20.s, z0.b, z3.b\n"
-      "sdot z21.s, z1.b, z3.b\n"
-      "sdot z22.s, z2.b, z3.b\n"
-      "sdot z23.s, z0.b, z4.b\n"
+      "sdot z20.s, z6.b, z7.b\n"
+      "sdot z21.s, z2.b, z7.b\n"
+      "sdot z22.s, z5.b, z7.b\n"
+      "sdot z23.s, z6.b, z4.b\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "sdot z24.s, z1.b, z4.b\n"
-      "sdot z25.s, z2.b, z4.b\n"
+      "sdot z24.s, z2.b, z4.b\n"
+      "sdot z25.s, z5.b, z4.b\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "sdot z26.s, z0.b, z5.b\n"
-      "sdot z27.s, z1.b, z5.b\n"
-      "sdot z28.s, z2.b, z5.b\n"
-      "sdot z29.s, z0.b, z6.b\n"
+      "sdot z26.s, z6.b, z0.b\n"
+      "sdot z27.s, z2.b, z0.b\n"
+      "sdot z28.s, z5.b, z0.b\n"
+      "sdot z29.s, z6.b, z1.b\n"
       "ld1b { z0.b }, p0/Z, [x22]\n"
-      "sdot z30.s, z1.b, z6.b\n"
-      "sdot z31.s, z2.b, z6.b\n"
+      "sdot z30.s, z2.b, z1.b\n"
+      "sdot z31.s, z5.b, z1.b\n"
       "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
       "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -165,7 +169,7 @@
       "sdot z9.s, z1.b, z3.b\n"
       "addvl x22, x22, #3\n"
       "sdot z10.s, z2.b, z3.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "sdot z11.s, z0.b, z4.b\n"
       "sdot z12.s, z1.b, z4.b\n"
       "sdot z13.s, z2.b, z4.b\n"
@@ -177,58 +181,58 @@
       "sdot z17.s, z0.b, z6.b\n"
       "sdot z18.s, z1.b, z6.b\n"
       "sdot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "sdot z20.s, z0.b, z3.b\n"
-      "sdot z21.s, z1.b, z3.b\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+      "sdot z20.s, z0.b, z7.b\n"
+      "sdot z21.s, z1.b, z7.b\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "sdot z22.s, z2.b, z3.b\n"
+      "sdot z22.s, z2.b, z7.b\n"
       "sdot z23.s, z0.b, z4.b\n"
       "sdot z24.s, z1.b, z4.b\n"
       "sdot z25.s, z2.b, z4.b\n"
       "sdot z26.s, z0.b, z5.b\n"
       "sdot z27.s, z1.b, z5.b\n"
       "sdot z28.s, z2.b, z5.b\n"
-      "sdot z29.s, z0.b, z6.b\n"
-      "sdot z30.s, z1.b, z6.b\n"
-      "sdot z31.s, z2.b, z6.b\n"
+      "sdot z29.s, z0.b, z3.b\n"
+      "sdot z30.s, z1.b, z3.b\n"
+      "sdot z31.s, z2.b, z3.b\n"
       "cbz x20, 5f\n"
-      "ld1b { z0.b }, p0/Z, [x22]\n"
-      "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
-      "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1b { z6.b }, p0/Z, [x22]\n"
+      "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "sdot z8.s, z0.b, z3.b\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
-      "sdot z9.s, z1.b, z3.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
-      "sdot z10.s, z2.b, z3.b\n"
-      "sdot z11.s, z0.b, z4.b\n"
-      "sdot z12.s, z1.b, z4.b\n"
-      "sdot z13.s, z2.b, z4.b\n"
+      "sdot z8.s, z6.b, z3.b\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+      "sdot z9.s, z5.b, z3.b\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+      "sdot z10.s, z4.b, z3.b\n"
+      "sdot z11.s, z6.b, z2.b\n"
+      "sdot z12.s, z5.b, z2.b\n"
+      "sdot z13.s, z4.b, z2.b\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
-      "sdot z14.s, z0.b, z5.b\n"
-      "sdot z15.s, z1.b, z5.b\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
-      "sdot z16.s, z2.b, z5.b\n"
-      "sdot z17.s, z0.b, z6.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
-      "sdot z18.s, z1.b, z6.b\n"
-      "sdot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "sdot z20.s, z0.b, z3.b\n"
-      "sdot z21.s, z1.b, z3.b\n"
+      "sdot z14.s, z6.b, z1.b\n"
+      "sdot z15.s, z5.b, z1.b\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+      "sdot z16.s, z4.b, z1.b\n"
+      "sdot z17.s, z6.b, z0.b\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+      "sdot z18.s, z5.b, z0.b\n"
+      "sdot z19.s, z4.b, z0.b\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+      "sdot z20.s, z6.b, z3.b\n"
+      "sdot z21.s, z5.b, z3.b\n"
       "addvl x22, x22, #3\n"
-      "sdot z22.s, z2.b, z3.b\n"
-      "sdot z23.s, z0.b, z4.b\n"
+      "sdot z22.s, z4.b, z3.b\n"
+      "sdot z23.s, z6.b, z2.b\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "sdot z24.s, z1.b, z4.b\n"
-      "sdot z25.s, z2.b, z4.b\n"
-      "sdot z26.s, z0.b, z5.b\n"
-      "sdot z27.s, z1.b, z5.b\n"
-      "sdot z28.s, z2.b, z5.b\n"
-      "sdot z29.s, z0.b, z6.b\n"
-      "sdot z30.s, z1.b, z6.b\n"
-      "sdot z31.s, z2.b, z6.b\n"
+      "sdot z24.s, z5.b, z2.b\n"
+      "sdot z25.s, z4.b, z2.b\n"
+      "sdot z26.s, z6.b, z1.b\n"
+      "sdot z27.s, z5.b, z1.b\n"
+      "sdot z28.s, z4.b, z1.b\n"
+      "sdot z29.s, z6.b, z0.b\n"
+      "sdot z30.s, z5.b, z0.b\n"
+      "sdot z31.s, z4.b, z0.b\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
@@ -262,7 +266,7 @@
       "bne 1b\n"
       : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
       : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
-      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
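
The clobber-list update at the end of this hunk follows from the rescheduled register allocation: the rewritten loop now writes z7 inside the asm statement, and any register modified by the asm body has to be declared as clobbered so the compiler does not keep a live value in it across the statement. A small, self-contained illustration of the rule (ordinary AArch64 general-purpose registers, not the kernel itself):

    // Standalone example of GCC extended-asm clobbers: w9/x9 is written
    // inside the asm body, so "x9" must appear in the clobber list.
    // "cc" is kept for consistency with the kernels above, which use
    // flag-setting compares such as subs/cmp.
    static inline int add_one(int acc) {
        __asm__ volatile(
            "mov w9, #1\n"
            "add %w[acc], %w[acc], w9\n"
            : [acc] "+r"(acc)
            :
            : "x9", "cc");
        return acc;
    }
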
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index e5f59d2..f6e1a75 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_s8s32_dot_8x3VL(
-    const int8_t *Apanel, const int8_t *Bpanel,
-    int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *Apanel,
+    const int8_t *Bpanel,
+    int32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -85,10 +89,10 @@
       "3:"  // main loop head
       "sdot z8.s, z4.b, z0.b[0]\n"
       "sdot z11.s, z4.b, z0.b[1]\n"
-      "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n"
       "sdot z14.s, z4.b, z0.b[2]\n"
       "sdot z17.s, z4.b, z0.b[3]\n"
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n"
       "sdot z20.s, z4.b, z1.b[0]\n"
       "sdot z23.s, z4.b, z1.b[1]\n"
       "sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@
       "sdot z25.s, z6.b, z1.b[1]\n"
       "sdot z28.s, z6.b, z1.b[2]\n"
       "sdot z31.s, z6.b, z1.b[3]\n"
-      "ld1b { z6.b }, p0/Z, [x22, #5, MUL VL]\n"
+      "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
       "addvl x22, x22, #6\n"
-      "sdot z8.s, z4.b, z2.b[0]\n"
-      "sdot z11.s, z4.b, z2.b[1]\n"
+      "sdot z8.s, z4.b, z3.b[0]\n"
+      "sdot z11.s, z4.b, z3.b[1]\n"
       "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
-      "sdot z14.s, z4.b, z2.b[2]\n"
-      "sdot z17.s, z4.b, z2.b[3]\n"
-      "sdot z20.s, z4.b, z3.b[0]\n"
-      "sdot z23.s, z4.b, z3.b[1]\n"
-      "sdot z26.s, z4.b, z3.b[2]\n"
-      "sdot z29.s, z4.b, z3.b[3]\n"
+      "sdot z14.s, z4.b, z3.b[2]\n"
+      "sdot z17.s, z4.b, z3.b[3]\n"
+      "sdot z20.s, z4.b, z7.b[0]\n"
+      "sdot z23.s, z4.b, z7.b[1]\n"
+      "sdot z26.s, z4.b, z7.b[2]\n"
+      "sdot z29.s, z4.b, z7.b[3]\n"
       "ld1b { z4.b }, p0/Z, [x22]\n"
-      "sdot z9.s, z5.b, z2.b[0]\n"
-      "sdot z12.s, z5.b, z2.b[1]\n"
-      "sdot z15.s, z5.b, z2.b[2]\n"
-      "sdot z18.s, z5.b, z2.b[3]\n"
-      "sdot z21.s, z5.b, z3.b[0]\n"
-      "sdot z24.s, z5.b, z3.b[1]\n"
-      "sdot z27.s, z5.b, z3.b[2]\n"
-      "sdot z30.s, z5.b, z3.b[3]\n"
+      "sdot z9.s, z5.b, z3.b[0]\n"
+      "sdot z12.s, z5.b, z3.b[1]\n"
+      "sdot z15.s, z5.b, z3.b[2]\n"
+      "sdot z18.s, z5.b, z3.b[3]\n"
+      "sdot z21.s, z5.b, z7.b[0]\n"
+      "sdot z24.s, z5.b, z7.b[1]\n"
+      "sdot z27.s, z5.b, z7.b[2]\n"
+      "sdot z30.s, z5.b, z7.b[3]\n"
       "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
-      "sdot z10.s, z6.b, z2.b[0]\n"
-      "sdot z13.s, z6.b, z2.b[1]\n"
-      "sdot z16.s, z6.b, z2.b[2]\n"
-      "sdot z19.s, z6.b, z2.b[3]\n"
-      "sdot z22.s, z6.b, z3.b[0]\n"
-      "sdot z25.s, z6.b, z3.b[1]\n"
-      "sdot z28.s, z6.b, z3.b[2]\n"
-      "sdot z31.s, z6.b, z3.b[3]\n"
+      "sdot z10.s, z2.b, z3.b[0]\n"
+      "sdot z13.s, z2.b, z3.b[1]\n"
+      "sdot z16.s, z2.b, z3.b[2]\n"
+      "sdot z19.s, z2.b, z3.b[3]\n"
+      "sdot z22.s, z2.b, z7.b[0]\n"
+      "sdot z25.s, z2.b, z7.b[1]\n"
+      "sdot z28.s, z2.b, z7.b[2]\n"
+      "sdot z31.s, z2.b, z7.b[3]\n"
       "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
       "bge 3b\n"
       "4:"  // main loop skip
@@ -174,37 +178,37 @@
       "sdot z28.s, z6.b, z1.b[2]\n"
       "sdot z31.s, z6.b, z1.b[3]\n"
       "cbz x20, 5f\n"
-      "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
-      "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ld1b { z7.b }, p0/Z, [x22]\n"
-      "ld1b { z4.b }, p0/Z, [x22, #1, MUL VL]\n"
-      "sdot z8.s, z7.b, z0.b[0]\n"
-      "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
-      "sdot z11.s, z7.b, z0.b[1]\n"
-      "sdot z14.s, z7.b, z0.b[2]\n"
-      "sdot z17.s, z7.b, z0.b[3]\n"
-      "sdot z20.s, z7.b, z1.b[0]\n"
+      "ld1b { z2.b }, p0/Z, [x22]\n"
+      "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+      "sdot z8.s, z2.b, z4.b[0]\n"
+      "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "sdot z11.s, z2.b, z4.b[1]\n"
+      "sdot z14.s, z2.b, z4.b[2]\n"
+      "sdot z17.s, z2.b, z4.b[3]\n"
+      "sdot z20.s, z2.b, z3.b[0]\n"
       "addvl x22, x22, #3\n"
-      "sdot z23.s, z7.b, z1.b[1]\n"
-      "sdot z26.s, z7.b, z1.b[2]\n"
-      "sdot z29.s, z7.b, z1.b[3]\n"
-      "sdot z9.s, z4.b, z0.b[0]\n"
-      "sdot z12.s, z4.b, z0.b[1]\n"
-      "sdot z15.s, z4.b, z0.b[2]\n"
-      "sdot z18.s, z4.b, z0.b[3]\n"
-      "sdot z21.s, z4.b, z1.b[0]\n"
-      "sdot z24.s, z4.b, z1.b[1]\n"
-      "sdot z27.s, z4.b, z1.b[2]\n"
-      "sdot z30.s, z4.b, z1.b[3]\n"
-      "sdot z10.s, z5.b, z0.b[0]\n"
-      "sdot z13.s, z5.b, z0.b[1]\n"
-      "sdot z16.s, z5.b, z0.b[2]\n"
-      "sdot z19.s, z5.b, z0.b[3]\n"
-      "sdot z22.s, z5.b, z1.b[0]\n"
-      "sdot z25.s, z5.b, z1.b[1]\n"
-      "sdot z28.s, z5.b, z1.b[2]\n"
-      "sdot z31.s, z5.b, z1.b[3]\n"
+      "sdot z23.s, z2.b, z3.b[1]\n"
+      "sdot z26.s, z2.b, z3.b[2]\n"
+      "sdot z29.s, z2.b, z3.b[3]\n"
+      "sdot z9.s, z1.b, z4.b[0]\n"
+      "sdot z12.s, z1.b, z4.b[1]\n"
+      "sdot z15.s, z1.b, z4.b[2]\n"
+      "sdot z18.s, z1.b, z4.b[3]\n"
+      "sdot z21.s, z1.b, z3.b[0]\n"
+      "sdot z24.s, z1.b, z3.b[1]\n"
+      "sdot z27.s, z1.b, z3.b[2]\n"
+      "sdot z30.s, z1.b, z3.b[3]\n"
+      "sdot z10.s, z0.b, z4.b[0]\n"
+      "sdot z13.s, z0.b, z4.b[1]\n"
+      "sdot z16.s, z0.b, z4.b[2]\n"
+      "sdot z19.s, z0.b, z4.b[3]\n"
+      "sdot z22.s, z0.b, z3.b[0]\n"
+      "sdot z25.s, z0.b, z3.b[1]\n"
+      "sdot z28.s, z0.b, z3.b[2]\n"
+      "sdot z31.s, z0.b, z3.b[3]\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 4e65296..82734ab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -55,11 +55,6 @@
         return get_vector_length<int32_t>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<int32_t>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 8;
@@ -89,7 +84,7 @@
                 default:
                     return { 61.97, 3.64, 0.50 };
                 case CPUModel::V1:
-                    return {  95.28, 7.99, 0.79 };
+                    return { 95.28, 7.99, 0.79 };
                 case CPUModel::A510:
                     return { 43.36, 1.86, 0.28 };
             }
@@ -108,5 +103,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index 104d5f9..bfed500 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_s8s32_mmla_8x3VL(
-    const int8_t *Apanel, const int8_t *Bpanel,
-    int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *Apanel,
+    const int8_t *Bpanel,
+    int32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -85,82 +89,82 @@
       "mov z31.s, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x45049808  // smmla z8.s, z0.b, z4.b\n"
       ".inst 0x4505980b  // smmla z11.s, z0.b, z5.b\n"
       ".inst 0x4504982e  // smmla z14.s, z1.b, z4.b\n"
       ".inst 0x45059831  // smmla z17.s, z1.b, z5.b\n"
-      "ld1b { z6.b }, p0/Z, [x22]\n"
+      "ld1b { z7.b }, p0/Z, [x22]\n"
       ".inst 0x45049854  // smmla z20.s, z2.b, z4.b\n"
       ".inst 0x45059857  // smmla z23.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x4504987a  // smmla z26.s, z3.b, z4.b\n"
-      ".inst 0x4505987d  // smmla z29.s, z3.b, z5.b\n"
-      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
-      "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x45069809  // smmla z9.s, z0.b, z6.b\n"
-      ".inst 0x4507980c  // smmla z12.s, z0.b, z7.b\n"
-      ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45079832  // smmla z18.s, z1.b, z7.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x450498da  // smmla z26.s, z6.b, z4.b\n"
+      ".inst 0x450598dd  // smmla z29.s, z6.b, z5.b\n"
+      "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x45079809  // smmla z9.s, z0.b, z7.b\n"
+      ".inst 0x4503980c  // smmla z12.s, z0.b, z3.b\n"
+      ".inst 0x4507982f  // smmla z15.s, z1.b, z7.b\n"
+      ".inst 0x45039832  // smmla z18.s, z1.b, z3.b\n"
       "sub x20, x20, #0x2\n"
-      ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      ".inst 0x45079858  // smmla z24.s, z2.b, z7.b\n"
+      ".inst 0x45079855  // smmla z21.s, z2.b, z7.b\n"
+      ".inst 0x45039858  // smmla z24.s, z2.b, z3.b\n"
       "cmp x20, #0x2\n"
-      ".inst 0x4506987b  // smmla z27.s, z3.b, z6.b\n"
-      ".inst 0x4507987e  // smmla z30.s, z3.b, z7.b\n"
-      "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
-      ".inst 0x4504980a  // smmla z10.s, z0.b, z4.b\n"
-      ".inst 0x4505980d  // smmla z13.s, z0.b, z5.b\n"
+      ".inst 0x450798db  // smmla z27.s, z6.b, z7.b\n"
+      ".inst 0x450398de  // smmla z30.s, z6.b, z3.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
+      ".inst 0x4505980a  // smmla z10.s, z0.b, z5.b\n"
+      ".inst 0x4504980d  // smmla z13.s, z0.b, z4.b\n"
       "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
-      ".inst 0x45049830  // smmla z16.s, z1.b, z4.b\n"
-      ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
+      ".inst 0x45059830  // smmla z16.s, z1.b, z5.b\n"
+      ".inst 0x45049833  // smmla z19.s, z1.b, z4.b\n"
       "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
-      ".inst 0x45049856  // smmla z22.s, z2.b, z4.b\n"
-      ".inst 0x45059859  // smmla z25.s, z2.b, z5.b\n"
+      ".inst 0x45059856  // smmla z22.s, z2.b, z5.b\n"
+      ".inst 0x45049859  // smmla z25.s, z2.b, z4.b\n"
       "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
-      ".inst 0x4504987c  // smmla z28.s, z3.b, z4.b\n"
-      ".inst 0x4505987f  // smmla z31.s, z3.b, z5.b\n"
-      "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n"
-      "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
-      ".inst 0x45069808  // smmla z8.s, z0.b, z6.b\n"
-      "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
+      ".inst 0x450598dc  // smmla z28.s, z6.b, z5.b\n"
+      ".inst 0x450498df  // smmla z31.s, z6.b, z4.b\n"
+      "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
+      "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
+      ".inst 0x45039808  // smmla z8.s, z0.b, z3.b\n"
+      "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
       "addvl x22, x22, #16\n"
       ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
+      ".inst 0x4503982e  // smmla z14.s, z1.b, z3.b\n"
       ".inst 0x45079831  // smmla z17.s, z1.b, z7.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      ".inst 0x45079857  // smmla z23.s, z2.b, z7.b\n"
-      ".inst 0x4506987a  // smmla z26.s, z3.b, z6.b\n"
-      ".inst 0x4507987d  // smmla z29.s, z3.b, z7.b\n"
-      "ld1b { z6.b }, p0/Z, [x22, #-8, MUL VL]\n"
+      ".inst 0x450398b4  // smmla z20.s, z5.b, z3.b\n"
+      ".inst 0x450798b7  // smmla z23.s, z5.b, z7.b\n"
+      ".inst 0x450398da  // smmla z26.s, z6.b, z3.b\n"
+      ".inst 0x450798dd  // smmla z29.s, z6.b, z7.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
       "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
-      ".inst 0x45049809  // smmla z9.s, z0.b, z4.b\n"
-      ".inst 0x4505980c  // smmla z12.s, z0.b, z5.b\n"
-      ".inst 0x4504982f  // smmla z15.s, z1.b, z4.b\n"
-      ".inst 0x45059832  // smmla z18.s, z1.b, z5.b\n"
-      ".inst 0x45049855  // smmla z21.s, z2.b, z4.b\n"
-      ".inst 0x45059858  // smmla z24.s, z2.b, z5.b\n"
-      ".inst 0x4504987b  // smmla z27.s, z3.b, z4.b\n"
-      ".inst 0x4505987e  // smmla z30.s, z3.b, z5.b\n"
+      ".inst 0x45029809  // smmla z9.s, z0.b, z2.b\n"
+      ".inst 0x4504980c  // smmla z12.s, z0.b, z4.b\n"
+      ".inst 0x4502982f  // smmla z15.s, z1.b, z2.b\n"
+      ".inst 0x45049832  // smmla z18.s, z1.b, z4.b\n"
+      ".inst 0x450298b5  // smmla z21.s, z5.b, z2.b\n"
+      ".inst 0x450498b8  // smmla z24.s, z5.b, z4.b\n"
+      ".inst 0x450298db  // smmla z27.s, z6.b, z2.b\n"
+      ".inst 0x450498de  // smmla z30.s, z6.b, z4.b\n"
       "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
-      ".inst 0x4506980a  // smmla z10.s, z0.b, z6.b\n"
+      ".inst 0x4503980a  // smmla z10.s, z0.b, z3.b\n"
       ".inst 0x4507980d  // smmla z13.s, z0.b, z7.b\n"
       "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
+      ".inst 0x45039830  // smmla z16.s, z1.b, z3.b\n"
       ".inst 0x45079833  // smmla z19.s, z1.b, z7.b\n"
       "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x45079859  // smmla z25.s, z2.b, z7.b\n"
+      ".inst 0x450398b6  // smmla z22.s, z5.b, z3.b\n"
+      ".inst 0x450798b9  // smmla z25.s, z5.b, z7.b\n"
       "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
-      ".inst 0x4506987c  // smmla z28.s, z3.b, z6.b\n"
-      ".inst 0x4507987f  // smmla z31.s, z3.b, z7.b\n"
+      ".inst 0x450398dc  // smmla z28.s, z6.b, z3.b\n"
+      ".inst 0x450798df  // smmla z31.s, z6.b, z7.b\n"
       "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "addvl x22, x22, #-4\n"
       "bge 3b\n"
       "4:"  // main loop skip
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x45049808  // smmla z8.s, z0.b, z4.b\n"
       ".inst 0x4505980b  // smmla z11.s, z0.b, z5.b\n"
       ".inst 0x4504982e  // smmla z14.s, z1.b, z4.b\n"
@@ -168,114 +172,114 @@
       "ld1b { z6.b }, p0/Z, [x22]\n"
       ".inst 0x45049854  // smmla z20.s, z2.b, z4.b\n"
       ".inst 0x45059857  // smmla z23.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x4504987a  // smmla z26.s, z3.b, z4.b\n"
-      ".inst 0x4505987d  // smmla z29.s, z3.b, z5.b\n"
-      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
-      "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x450498fa  // smmla z26.s, z7.b, z4.b\n"
+      ".inst 0x450598fd  // smmla z29.s, z7.b, z5.b\n"
+      "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
       ".inst 0x45069809  // smmla z9.s, z0.b, z6.b\n"
-      ".inst 0x4507980c  // smmla z12.s, z0.b, z7.b\n"
+      ".inst 0x4503980c  // smmla z12.s, z0.b, z3.b\n"
       ".inst 0x4506982f  // smmla z15.s, z1.b, z6.b\n"
-      ".inst 0x45079832  // smmla z18.s, z1.b, z7.b\n"
+      ".inst 0x45039832  // smmla z18.s, z1.b, z3.b\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
       ".inst 0x45069855  // smmla z21.s, z2.b, z6.b\n"
-      ".inst 0x45079858  // smmla z24.s, z2.b, z7.b\n"
+      ".inst 0x45039858  // smmla z24.s, z2.b, z3.b\n"
       "addvl x22, x22, #4\n"
-      ".inst 0x4506987b  // smmla z27.s, z3.b, z6.b\n"
-      ".inst 0x4507987e  // smmla z30.s, z3.b, z7.b\n"
-      ".inst 0x4504980a  // smmla z10.s, z0.b, z4.b\n"
-      ".inst 0x4505980d  // smmla z13.s, z0.b, z5.b\n"
-      ".inst 0x45049830  // smmla z16.s, z1.b, z4.b\n"
-      ".inst 0x45059833  // smmla z19.s, z1.b, z5.b\n"
-      ".inst 0x45049856  // smmla z22.s, z2.b, z4.b\n"
-      ".inst 0x45059859  // smmla z25.s, z2.b, z5.b\n"
-      ".inst 0x4504987c  // smmla z28.s, z3.b, z4.b\n"
-      ".inst 0x4505987f  // smmla z31.s, z3.b, z5.b\n"
+      ".inst 0x450698fb  // smmla z27.s, z7.b, z6.b\n"
+      ".inst 0x450398fe  // smmla z30.s, z7.b, z3.b\n"
+      ".inst 0x4505980a  // smmla z10.s, z0.b, z5.b\n"
+      ".inst 0x4504980d  // smmla z13.s, z0.b, z4.b\n"
+      ".inst 0x45059830  // smmla z16.s, z1.b, z5.b\n"
+      ".inst 0x45049833  // smmla z19.s, z1.b, z4.b\n"
+      ".inst 0x45059856  // smmla z22.s, z2.b, z5.b\n"
+      ".inst 0x45049859  // smmla z25.s, z2.b, z4.b\n"
+      ".inst 0x450598fc  // smmla z28.s, z7.b, z5.b\n"
+      ".inst 0x450498ff  // smmla z31.s, z7.b, z4.b\n"
       "cbz x20, 5f\n"
-      "ld1b { z6.b }, p0/Z, [x22]\n"
-      "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
-      ".inst 0x45069808  // smmla z8.s, z0.b, z6.b\n"
-      "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
-      "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x4507980b  // smmla z11.s, z0.b, z7.b\n"
-      "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
-      ".inst 0x4506982e  // smmla z14.s, z1.b, z6.b\n"
-      ".inst 0x45079831  // smmla z17.s, z1.b, z7.b\n"
-      ".inst 0x45069854  // smmla z20.s, z2.b, z6.b\n"
-      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
-      ".inst 0x45079857  // smmla z23.s, z2.b, z7.b\n"
-      ".inst 0x4506987a  // smmla z26.s, z3.b, z6.b\n"
-      "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x4507987d  // smmla z29.s, z3.b, z7.b\n"
-      "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
-      ".inst 0x45049809  // smmla z9.s, z0.b, z4.b\n"
-      ".inst 0x4505980c  // smmla z12.s, z0.b, z5.b\n"
+      "ld1b { z1.b }, p0/Z, [x22]\n"
+      "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+      ".inst 0x450198e8  // smmla z8.s, z7.b, z1.b\n"
+      "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x450098eb  // smmla z11.s, z7.b, z0.b\n"
+      "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
+      ".inst 0x450198ce  // smmla z14.s, z6.b, z1.b\n"
+      ".inst 0x450098d1  // smmla z17.s, z6.b, z0.b\n"
+      ".inst 0x450198b4  // smmla z20.s, z5.b, z1.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
+      ".inst 0x450098b7  // smmla z23.s, z5.b, z0.b\n"
+      ".inst 0x4501989a  // smmla z26.s, z4.b, z1.b\n"
+      "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x4500989d  // smmla z29.s, z4.b, z0.b\n"
+      "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
+      ".inst 0x450398e9  // smmla z9.s, z7.b, z3.b\n"
+      ".inst 0x450298ec  // smmla z12.s, z7.b, z2.b\n"
       "addvl x22, x22, #6\n"
-      ".inst 0x4504982f  // smmla z15.s, z1.b, z4.b\n"
-      ".inst 0x45059832  // smmla z18.s, z1.b, z5.b\n"
+      ".inst 0x450398cf  // smmla z15.s, z6.b, z3.b\n"
+      ".inst 0x450298d2  // smmla z18.s, z6.b, z2.b\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x45049855  // smmla z21.s, z2.b, z4.b\n"
-      ".inst 0x45059858  // smmla z24.s, z2.b, z5.b\n"
-      ".inst 0x4504987b  // smmla z27.s, z3.b, z4.b\n"
-      ".inst 0x4505987e  // smmla z30.s, z3.b, z5.b\n"
-      ".inst 0x4506980a  // smmla z10.s, z0.b, z6.b\n"
-      ".inst 0x4507980d  // smmla z13.s, z0.b, z7.b\n"
-      ".inst 0x45069830  // smmla z16.s, z1.b, z6.b\n"
-      ".inst 0x45079833  // smmla z19.s, z1.b, z7.b\n"
-      ".inst 0x45069856  // smmla z22.s, z2.b, z6.b\n"
-      ".inst 0x45079859  // smmla z25.s, z2.b, z7.b\n"
-      ".inst 0x4506987c  // smmla z28.s, z3.b, z6.b\n"
-      ".inst 0x4507987f  // smmla z31.s, z3.b, z7.b\n"
+      ".inst 0x450398b5  // smmla z21.s, z5.b, z3.b\n"
+      ".inst 0x450298b8  // smmla z24.s, z5.b, z2.b\n"
+      ".inst 0x4503989b  // smmla z27.s, z4.b, z3.b\n"
+      ".inst 0x4502989e  // smmla z30.s, z4.b, z2.b\n"
+      ".inst 0x450198ea  // smmla z10.s, z7.b, z1.b\n"
+      ".inst 0x450098ed  // smmla z13.s, z7.b, z0.b\n"
+      ".inst 0x450198d0  // smmla z16.s, z6.b, z1.b\n"
+      ".inst 0x450098d3  // smmla z19.s, z6.b, z0.b\n"
+      ".inst 0x450198b6  // smmla z22.s, z5.b, z1.b\n"
+      ".inst 0x450098b9  // smmla z25.s, z5.b, z0.b\n"
+      ".inst 0x4501989c  // smmla z28.s, z4.b, z1.b\n"
+      ".inst 0x4500989f  // smmla z31.s, z4.b, z0.b\n"
       "5:"  // multiply loop done
-      "uzp1 z4.d, z8.d, z11.d\n"
+      "uzp1 z0.d, z8.d, z11.d\n"
       "uzp2 z8.d, z8.d, z11.d\n"
-      "st1w { z4.s }, p0, [%x[Cpanel]]\n"
-      "uzp1 z11.d, z9.d, z12.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+      "uzp1 z0.d, z9.d, z12.d\n"
       "uzp2 z9.d, z9.d, z12.d\n"
-      "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "uzp1 z12.d, z10.d, z13.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+      "uzp1 z0.d, z10.d, z13.d\n"
       "uzp2 z10.d, z10.d, z13.d\n"
-      "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "uzp1 z13.d, z14.d, z17.d\n"
+      "uzp1 z0.d, z14.d, z17.d\n"
       "uzp2 z14.d, z14.d, z17.d\n"
       "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
-      "uzp1 z17.d, z15.d, z18.d\n"
+      "uzp1 z1.d, z15.d, z18.d\n"
       "subs x23, x23, #0x1\n"
       "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "uzp2 z15.d, z15.d, z18.d\n"
-      "uzp1 z18.d, z16.d, z19.d\n"
-      "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+      "uzp1 z17.d, z16.d, z19.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "uzp2 z16.d, z16.d, z19.d\n"
-      "uzp1 z19.d, z20.d, z23.d\n"
-      "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+      "uzp1 z0.d, z20.d, z23.d\n"
+      "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
       "addvl %x[Cpanel], %x[Cpanel], #16\n"
       "uzp2 z20.d, z20.d, z23.d\n"
-      "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
       "uzp1 z23.d, z21.d, z24.d\n"
       "uzp2 z21.d, z21.d, z24.d\n"
       "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
-      "uzp1 z24.d, z22.d, z25.d\n"
+      "uzp1 z19.d, z22.d, z25.d\n"
       "uzp2 z22.d, z22.d, z25.d\n"
       "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
-      "uzp1 z25.d, z26.d, z29.d\n"
+      "uzp1 z18.d, z26.d, z29.d\n"
       "uzp2 z26.d, z26.d, z29.d\n"
       "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
-      "uzp1 z29.d, z27.d, z30.d\n"
+      "uzp1 z17.d, z27.d, z30.d\n"
       "uzp2 z27.d, z27.d, z30.d\n"
-      "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
-      "uzp1 z30.d, z28.d, z31.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+      "uzp1 z16.d, z28.d, z31.d\n"
       "uzp2 z28.d, z28.d, z31.d\n"
       "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
-      "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+      "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
       "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
       "st1w { z21.s }, p0, [%x[Cpanel]]\n"
       "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
-      "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+      "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+      "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
       "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
@@ -290,4 +294,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
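
The closing-comment change above is part of the guard-directive cleanup this patch makes: the SVE kernels are compiled under the library's build-time option ARM_COMPUTE_ENABLE_SVE rather than the compiler-defined ACLE macro __ARM_FEATURE_SVE, and the trailing #endif comments are updated to name the macro actually being tested. The resulting guard shape, in sketch form (file contents elided):

    // Guard keyed on the build-system option rather than the compiler macro.
    #ifdef ARM_COMPUTE_ENABLE_SVE

    namespace arm_gemm {
    // ... SVE kernel implementation ...
    } // namespace arm_gemm

    #endif // ARM_COMPUTE_ENABLE_SVE
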
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 0afcdd2..c0b215c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -35,6 +35,7 @@
 {
 // Actual kernel implementations
 void sve_interleaved_u8u32_dot_8x3VL( ARGLIST );
+void sve_interleaved_u8u32_dot_8x3VL_a64fx( ARGLIST );
 
 class cls_sve_interleaved_u8u32_dot_8x3VL
 {
@@ -55,11 +56,6 @@
         return get_vector_length<uint32_t>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<uint32_t>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 4;
@@ -80,6 +76,8 @@
                     return { 27.44, 3.41, 2.90 };
                 case CPUModel::V1:
                     return { 63.30, 4.97, 11.52 };
+                case CPUModel::A64FX:
+                    return { 109.76, 3.88, 6.76 };
             }
         }
 
@@ -92,6 +90,8 @@
                     return { 27.45, 1.65, 0.28 };
                 case CPUModel::V1:
                     return { 52.24, 7.49, 0.80 };
+                case CPUModel::A64FX:
+                    return { 110.18, 2.34, 0.40 };
             }
         }
 
@@ -100,13 +100,19 @@
 
     // Default to the generic kernel
     kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
-    cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
+    cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *ci)
     {
+        switch(ci->get_cpu_model()) {
+            default:
+                break;
+            case CPUModel::A64FX:
+                kernel=sve_interleaved_u8u32_dot_8x3VL_a64fx;
+                break;
+        }
     }
 };
 
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
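
The { a, b, c } triples added in the two switch statements above are per-CPU cycle-estimate coefficients; giving A64FX its own rows lets the implementation-selection heuristic model that core instead of falling back to the default entry. A reduced sketch of the lookup pattern follows; the struct and field names are assumptions made for this example, not the library's exact definitions, while the numeric values are the ones added above for the dot-product kernel's first table.

    // Illustrative per-CPU tuning-parameter lookup (names assumed for this sketch).
    struct TuningParams {
        float macs_per_cycle;      // arithmetic throughput term
        float prepare_bytes_cycle; // panel-preparation cost term
        float merge_bytes_cycle;   // result-merge cost term
    };

    enum class CPUModel { GENERIC, V1, A64FX };

    TuningParams get_tuning_params(CPUModel model) {
        switch (model) {
            default:
                return {27.44f, 3.41f, 2.90f};  // generic fallback
            case CPUModel::V1:
                return {63.30f, 4.97f, 11.52f};
            case CPUModel::A64FX:
                return {109.76f, 3.88f, 6.76f}; // new A64FX row from this patch
        }
    }
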
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
index 2bfec8f..79e794a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_u8u32_dot_8x3VL_a64fx(
-    const uint8_t *Apanel, const uint8_t *Bpanel,
-    uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *Apanel,
+    const uint8_t *Bpanel,
+    uint32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -89,7 +93,7 @@
       "udot z9.s, z1.b, z3.b\n"
       "sub x20, x20, #0x2\n"
       "udot z10.s, z2.b, z3.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "udot z11.s, z0.b, z4.b\n"
       "udot z12.s, z1.b, z4.b\n"
       "udot z13.s, z2.b, z4.b\n"
@@ -98,63 +102,63 @@
       "udot z15.s, z1.b, z5.b\n"
       "cmp x20, #0x2\n"
       "udot z16.s, z2.b, z5.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n"
       "udot z17.s, z0.b, z6.b\n"
       "udot z18.s, z1.b, z6.b\n"
       "udot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "udot z20.s, z0.b, z3.b\n"
-      "udot z21.s, z1.b, z3.b\n"
-      "udot z22.s, z2.b, z3.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n"
+      "udot z20.s, z0.b, z7.b\n"
+      "udot z21.s, z1.b, z7.b\n"
+      "udot z22.s, z2.b, z7.b\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n"
       "udot z23.s, z0.b, z4.b\n"
       "udot z24.s, z1.b, z4.b\n"
       "udot z25.s, z2.b, z4.b\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
-      "udot z26.s, z0.b, z5.b\n"
-      "udot z27.s, z1.b, z5.b\n"
-      "udot z28.s, z2.b, z5.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
-      "udot z29.s, z0.b, z6.b\n"
-      "ld1b { z0.b }, p0/Z, [x22, #3, MUL VL]\n"
-      "udot z30.s, z1.b, z6.b\n"
-      "udot z31.s, z2.b, z6.b\n"
-      "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
-      "udot z8.s, z0.b, z3.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
-      "udot z9.s, z1.b, z3.b\n"
-      "udot z10.s, z2.b, z3.b\n"
-      "udot z11.s, z0.b, z4.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
-      "udot z12.s, z1.b, z4.b\n"
-      "udot z13.s, z2.b, z4.b\n"
+      "udot z26.s, z0.b, z3.b\n"
+      "udot z27.s, z1.b, z3.b\n"
+      "udot z28.s, z2.b, z3.b\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n"
+      "udot z29.s, z0.b, z5.b\n"
+      "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n"
+      "udot z30.s, z1.b, z5.b\n"
+      "udot z31.s, z2.b, z5.b\n"
+      "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z7.b\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n"
+      "udot z9.s, z2.b, z7.b\n"
+      "udot z10.s, z5.b, z7.b\n"
+      "udot z11.s, z6.b, z4.b\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n"
+      "udot z12.s, z2.b, z4.b\n"
+      "udot z13.s, z5.b, z4.b\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
-      "udot z14.s, z0.b, z5.b\n"
-      "udot z15.s, z1.b, z5.b\n"
+      "udot z14.s, z6.b, z3.b\n"
+      "udot z15.s, z2.b, z3.b\n"
       "addvl x22, x22, #6\n"
-      "udot z16.s, z2.b, z5.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
-      "udot z17.s, z0.b, z6.b\n"
-      "udot z18.s, z1.b, z6.b\n"
-      "udot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+      "udot z16.s, z5.b, z3.b\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n"
+      "udot z17.s, z6.b, z1.b\n"
+      "udot z18.s, z2.b, z1.b\n"
+      "udot z19.s, z5.b, z1.b\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      "udot z20.s, z0.b, z3.b\n"
-      "udot z21.s, z1.b, z3.b\n"
-      "udot z22.s, z2.b, z3.b\n"
-      "udot z23.s, z0.b, z4.b\n"
+      "udot z20.s, z6.b, z7.b\n"
+      "udot z21.s, z2.b, z7.b\n"
+      "udot z22.s, z5.b, z7.b\n"
+      "udot z23.s, z6.b, z4.b\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "udot z24.s, z1.b, z4.b\n"
-      "udot z25.s, z2.b, z4.b\n"
+      "udot z24.s, z2.b, z4.b\n"
+      "udot z25.s, z5.b, z4.b\n"
       "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "udot z26.s, z0.b, z5.b\n"
-      "udot z27.s, z1.b, z5.b\n"
-      "udot z28.s, z2.b, z5.b\n"
-      "udot z29.s, z0.b, z6.b\n"
+      "udot z26.s, z6.b, z0.b\n"
+      "udot z27.s, z2.b, z0.b\n"
+      "udot z28.s, z5.b, z0.b\n"
+      "udot z29.s, z6.b, z1.b\n"
       "ld1b { z0.b }, p0/Z, [x22]\n"
-      "udot z30.s, z1.b, z6.b\n"
-      "udot z31.s, z2.b, z6.b\n"
+      "udot z30.s, z2.b, z1.b\n"
+      "udot z31.s, z5.b, z1.b\n"
       "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
       "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
@@ -165,7 +169,7 @@
       "udot z9.s, z1.b, z3.b\n"
       "addvl x22, x22, #3\n"
       "udot z10.s, z2.b, z3.b\n"
-      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n"
       "udot z11.s, z0.b, z4.b\n"
       "udot z12.s, z1.b, z4.b\n"
       "udot z13.s, z2.b, z4.b\n"
@@ -177,58 +181,58 @@
       "udot z17.s, z0.b, z6.b\n"
       "udot z18.s, z1.b, z6.b\n"
       "udot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "udot z20.s, z0.b, z3.b\n"
-      "udot z21.s, z1.b, z3.b\n"
+      "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n"
+      "udot z20.s, z0.b, z7.b\n"
+      "udot z21.s, z1.b, z7.b\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "udot z22.s, z2.b, z3.b\n"
+      "udot z22.s, z2.b, z7.b\n"
       "udot z23.s, z0.b, z4.b\n"
       "udot z24.s, z1.b, z4.b\n"
       "udot z25.s, z2.b, z4.b\n"
       "udot z26.s, z0.b, z5.b\n"
       "udot z27.s, z1.b, z5.b\n"
       "udot z28.s, z2.b, z5.b\n"
-      "udot z29.s, z0.b, z6.b\n"
-      "udot z30.s, z1.b, z6.b\n"
-      "udot z31.s, z2.b, z6.b\n"
+      "udot z29.s, z0.b, z3.b\n"
+      "udot z30.s, z1.b, z3.b\n"
+      "udot z31.s, z2.b, z3.b\n"
       "cbz x20, 5f\n"
-      "ld1b { z0.b }, p0/Z, [x22]\n"
-      "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
-      "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1b { z6.b }, p0/Z, [x22]\n"
+      "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
+      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
-      "udot z8.s, z0.b, z3.b\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
-      "udot z9.s, z1.b, z3.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
-      "udot z10.s, z2.b, z3.b\n"
-      "udot z11.s, z0.b, z4.b\n"
-      "udot z12.s, z1.b, z4.b\n"
-      "udot z13.s, z2.b, z4.b\n"
+      "udot z8.s, z6.b, z3.b\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n"
+      "udot z9.s, z5.b, z3.b\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n"
+      "udot z10.s, z4.b, z3.b\n"
+      "udot z11.s, z6.b, z2.b\n"
+      "udot z12.s, z5.b, z2.b\n"
+      "udot z13.s, z4.b, z2.b\n"
       "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
-      "udot z14.s, z0.b, z5.b\n"
-      "udot z15.s, z1.b, z5.b\n"
-      "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
-      "udot z16.s, z2.b, z5.b\n"
-      "udot z17.s, z0.b, z6.b\n"
-      "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
-      "udot z18.s, z1.b, z6.b\n"
-      "udot z19.s, z2.b, z6.b\n"
-      "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
-      "udot z20.s, z0.b, z3.b\n"
-      "udot z21.s, z1.b, z3.b\n"
+      "udot z14.s, z6.b, z1.b\n"
+      "udot z15.s, z5.b, z1.b\n"
+      "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n"
+      "udot z16.s, z4.b, z1.b\n"
+      "udot z17.s, z6.b, z0.b\n"
+      "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n"
+      "udot z18.s, z5.b, z0.b\n"
+      "udot z19.s, z4.b, z0.b\n"
+      "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n"
+      "udot z20.s, z6.b, z3.b\n"
+      "udot z21.s, z5.b, z3.b\n"
       "addvl x22, x22, #3\n"
-      "udot z22.s, z2.b, z3.b\n"
-      "udot z23.s, z0.b, z4.b\n"
+      "udot z22.s, z4.b, z3.b\n"
+      "udot z23.s, z6.b, z2.b\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "udot z24.s, z1.b, z4.b\n"
-      "udot z25.s, z2.b, z4.b\n"
-      "udot z26.s, z0.b, z5.b\n"
-      "udot z27.s, z1.b, z5.b\n"
-      "udot z28.s, z2.b, z5.b\n"
-      "udot z29.s, z0.b, z6.b\n"
-      "udot z30.s, z1.b, z6.b\n"
-      "udot z31.s, z2.b, z6.b\n"
+      "udot z24.s, z5.b, z2.b\n"
+      "udot z25.s, z4.b, z2.b\n"
+      "udot z26.s, z6.b, z1.b\n"
+      "udot z27.s, z5.b, z1.b\n"
+      "udot z28.s, z4.b, z1.b\n"
+      "udot z29.s, z6.b, z0.b\n"
+      "udot z30.s, z5.b, z0.b\n"
+      "udot z31.s, z4.b, z0.b\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
@@ -262,7 +266,7 @@
       "bne 1b\n"
       : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
       : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
-      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+      : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
     );
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 99fff4e..1c88336 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_u8u32_dot_8x3VL(
-    const uint8_t *Apanel, const uint8_t *Bpanel,
-    uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *Apanel,
+    const uint8_t *Bpanel,
+    uint32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -85,10 +89,10 @@
       "3:"  // main loop head
       "udot z8.s, z4.b, z0.b[0]\n"
       "udot z11.s, z4.b, z0.b[1]\n"
-      "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n"
       "udot z14.s, z4.b, z0.b[2]\n"
       "udot z17.s, z4.b, z0.b[3]\n"
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n"
       "udot z20.s, z4.b, z1.b[0]\n"
       "udot z23.s, z4.b, z1.b[1]\n"
       "sub x20, x20, #0x2\n"
@@ -115,35 +119,35 @@
       "udot z25.s, z6.b, z1.b[1]\n"
       "udot z28.s, z6.b, z1.b[2]\n"
       "udot z31.s, z6.b, z1.b[3]\n"
-      "ld1b { z6.b }, p0/Z, [x22, #5, MUL VL]\n"
+      "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n"
       "addvl x22, x22, #6\n"
-      "udot z8.s, z4.b, z2.b[0]\n"
-      "udot z11.s, z4.b, z2.b[1]\n"
+      "udot z8.s, z4.b, z3.b[0]\n"
+      "udot z11.s, z4.b, z3.b[1]\n"
       "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
-      "udot z14.s, z4.b, z2.b[2]\n"
-      "udot z17.s, z4.b, z2.b[3]\n"
-      "udot z20.s, z4.b, z3.b[0]\n"
-      "udot z23.s, z4.b, z3.b[1]\n"
-      "udot z26.s, z4.b, z3.b[2]\n"
-      "udot z29.s, z4.b, z3.b[3]\n"
+      "udot z14.s, z4.b, z3.b[2]\n"
+      "udot z17.s, z4.b, z3.b[3]\n"
+      "udot z20.s, z4.b, z7.b[0]\n"
+      "udot z23.s, z4.b, z7.b[1]\n"
+      "udot z26.s, z4.b, z7.b[2]\n"
+      "udot z29.s, z4.b, z7.b[3]\n"
       "ld1b { z4.b }, p0/Z, [x22]\n"
-      "udot z9.s, z5.b, z2.b[0]\n"
-      "udot z12.s, z5.b, z2.b[1]\n"
-      "udot z15.s, z5.b, z2.b[2]\n"
-      "udot z18.s, z5.b, z2.b[3]\n"
-      "udot z21.s, z5.b, z3.b[0]\n"
-      "udot z24.s, z5.b, z3.b[1]\n"
-      "udot z27.s, z5.b, z3.b[2]\n"
-      "udot z30.s, z5.b, z3.b[3]\n"
+      "udot z9.s, z5.b, z3.b[0]\n"
+      "udot z12.s, z5.b, z3.b[1]\n"
+      "udot z15.s, z5.b, z3.b[2]\n"
+      "udot z18.s, z5.b, z3.b[3]\n"
+      "udot z21.s, z5.b, z7.b[0]\n"
+      "udot z24.s, z5.b, z7.b[1]\n"
+      "udot z27.s, z5.b, z7.b[2]\n"
+      "udot z30.s, z5.b, z7.b[3]\n"
       "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n"
-      "udot z10.s, z6.b, z2.b[0]\n"
-      "udot z13.s, z6.b, z2.b[1]\n"
-      "udot z16.s, z6.b, z2.b[2]\n"
-      "udot z19.s, z6.b, z2.b[3]\n"
-      "udot z22.s, z6.b, z3.b[0]\n"
-      "udot z25.s, z6.b, z3.b[1]\n"
-      "udot z28.s, z6.b, z3.b[2]\n"
-      "udot z31.s, z6.b, z3.b[3]\n"
+      "udot z10.s, z2.b, z3.b[0]\n"
+      "udot z13.s, z2.b, z3.b[1]\n"
+      "udot z16.s, z2.b, z3.b[2]\n"
+      "udot z19.s, z2.b, z3.b[3]\n"
+      "udot z22.s, z2.b, z7.b[0]\n"
+      "udot z25.s, z2.b, z7.b[1]\n"
+      "udot z28.s, z2.b, z7.b[2]\n"
+      "udot z31.s, z2.b, z7.b[3]\n"
       "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n"
       "bge 3b\n"
       "4:"  // main loop skip
@@ -174,37 +178,37 @@
       "udot z28.s, z6.b, z1.b[2]\n"
       "udot z31.s, z6.b, z1.b[3]\n"
       "cbz x20, 5f\n"
-      "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
-      "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n"
       "add %x[Apanel], %x[Apanel], #0x20\n"
-      "ld1b { z7.b }, p0/Z, [x22]\n"
-      "ld1b { z4.b }, p0/Z, [x22, #1, MUL VL]\n"
-      "udot z8.s, z7.b, z0.b[0]\n"
-      "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
-      "udot z11.s, z7.b, z0.b[1]\n"
-      "udot z14.s, z7.b, z0.b[2]\n"
-      "udot z17.s, z7.b, z0.b[3]\n"
-      "udot z20.s, z7.b, z1.b[0]\n"
+      "ld1b { z2.b }, p0/Z, [x22]\n"
+      "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n"
+      "udot z8.s, z2.b, z4.b[0]\n"
+      "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "udot z11.s, z2.b, z4.b[1]\n"
+      "udot z14.s, z2.b, z4.b[2]\n"
+      "udot z17.s, z2.b, z4.b[3]\n"
+      "udot z20.s, z2.b, z3.b[0]\n"
       "addvl x22, x22, #3\n"
-      "udot z23.s, z7.b, z1.b[1]\n"
-      "udot z26.s, z7.b, z1.b[2]\n"
-      "udot z29.s, z7.b, z1.b[3]\n"
-      "udot z9.s, z4.b, z0.b[0]\n"
-      "udot z12.s, z4.b, z0.b[1]\n"
-      "udot z15.s, z4.b, z0.b[2]\n"
-      "udot z18.s, z4.b, z0.b[3]\n"
-      "udot z21.s, z4.b, z1.b[0]\n"
-      "udot z24.s, z4.b, z1.b[1]\n"
-      "udot z27.s, z4.b, z1.b[2]\n"
-      "udot z30.s, z4.b, z1.b[3]\n"
-      "udot z10.s, z5.b, z0.b[0]\n"
-      "udot z13.s, z5.b, z0.b[1]\n"
-      "udot z16.s, z5.b, z0.b[2]\n"
-      "udot z19.s, z5.b, z0.b[3]\n"
-      "udot z22.s, z5.b, z1.b[0]\n"
-      "udot z25.s, z5.b, z1.b[1]\n"
-      "udot z28.s, z5.b, z1.b[2]\n"
-      "udot z31.s, z5.b, z1.b[3]\n"
+      "udot z23.s, z2.b, z3.b[1]\n"
+      "udot z26.s, z2.b, z3.b[2]\n"
+      "udot z29.s, z2.b, z3.b[3]\n"
+      "udot z9.s, z1.b, z4.b[0]\n"
+      "udot z12.s, z1.b, z4.b[1]\n"
+      "udot z15.s, z1.b, z4.b[2]\n"
+      "udot z18.s, z1.b, z4.b[3]\n"
+      "udot z21.s, z1.b, z3.b[0]\n"
+      "udot z24.s, z1.b, z3.b[1]\n"
+      "udot z27.s, z1.b, z3.b[2]\n"
+      "udot z30.s, z1.b, z3.b[3]\n"
+      "udot z10.s, z0.b, z4.b[0]\n"
+      "udot z13.s, z0.b, z4.b[1]\n"
+      "udot z16.s, z0.b, z4.b[2]\n"
+      "udot z19.s, z0.b, z4.b[3]\n"
+      "udot z22.s, z0.b, z3.b[0]\n"
+      "udot z25.s, z0.b, z3.b[1]\n"
+      "udot z28.s, z0.b, z3.b[2]\n"
+      "udot z31.s, z0.b, z3.b[3]\n"
       "5:"  // multiply loop done
       "st1w { z8.s }, p0, [%x[Cpanel]]\n"
       "subs x23, x23, #0x1\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index 58d21d6..067d0bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,20 +10,20 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -55,11 +55,6 @@
         return get_vector_length<uint32_t>() * 3;
     }
 
-    static unsigned int stripe_width()
-    {
-        return get_vector_length<uint32_t>();
-    }
-
     static constexpr unsigned int k_unroll()
     {
         return 8;
@@ -108,5 +103,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index 0b70d03..28449ea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -29,8 +29,12 @@
 namespace arm_gemm {
 
 void sve_interleaved_u8u32_mmla_8x3VL(
-    const uint8_t *Apanel, const uint8_t *Bpanel,
-    uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *Apanel,
+    const uint8_t *Bpanel,
+    uint32_t *Cpanel,
+    int ablocks,
+    int bblocks,
+    int K) {
 
     struct KernelArgs {
         size_t K = {};
@@ -85,82 +89,82 @@
       "mov z31.s, #0x0\n"
       "blt 4f\n"
       "3:"  // main loop head
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x45c49808  // ummla z8.s, z0.b, z4.b\n"
       ".inst 0x45c5980b  // ummla z11.s, z0.b, z5.b\n"
       ".inst 0x45c4982e  // ummla z14.s, z1.b, z4.b\n"
       ".inst 0x45c59831  // ummla z17.s, z1.b, z5.b\n"
-      "ld1b { z6.b }, p0/Z, [x22]\n"
+      "ld1b { z7.b }, p0/Z, [x22]\n"
       ".inst 0x45c49854  // ummla z20.s, z2.b, z4.b\n"
       ".inst 0x45c59857  // ummla z23.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x45c4987a  // ummla z26.s, z3.b, z4.b\n"
-      ".inst 0x45c5987d  // ummla z29.s, z3.b, z5.b\n"
-      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
-      "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x45c69809  // ummla z9.s, z0.b, z6.b\n"
-      ".inst 0x45c7980c  // ummla z12.s, z0.b, z7.b\n"
-      ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c79832  // ummla z18.s, z1.b, z7.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x45c498da  // ummla z26.s, z6.b, z4.b\n"
+      ".inst 0x45c598dd  // ummla z29.s, z6.b, z5.b\n"
+      "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x45c79809  // ummla z9.s, z0.b, z7.b\n"
+      ".inst 0x45c3980c  // ummla z12.s, z0.b, z3.b\n"
+      ".inst 0x45c7982f  // ummla z15.s, z1.b, z7.b\n"
+      ".inst 0x45c39832  // ummla z18.s, z1.b, z3.b\n"
       "sub x20, x20, #0x2\n"
-      ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      ".inst 0x45c79858  // ummla z24.s, z2.b, z7.b\n"
+      ".inst 0x45c79855  // ummla z21.s, z2.b, z7.b\n"
+      ".inst 0x45c39858  // ummla z24.s, z2.b, z3.b\n"
       "cmp x20, #0x2\n"
-      ".inst 0x45c6987b  // ummla z27.s, z3.b, z6.b\n"
-      ".inst 0x45c7987e  // ummla z30.s, z3.b, z7.b\n"
-      "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
-      ".inst 0x45c4980a  // ummla z10.s, z0.b, z4.b\n"
-      ".inst 0x45c5980d  // ummla z13.s, z0.b, z5.b\n"
+      ".inst 0x45c798db  // ummla z27.s, z6.b, z7.b\n"
+      ".inst 0x45c398de  // ummla z30.s, z6.b, z3.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n"
+      ".inst 0x45c5980a  // ummla z10.s, z0.b, z5.b\n"
+      ".inst 0x45c4980d  // ummla z13.s, z0.b, z4.b\n"
       "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
-      ".inst 0x45c49830  // ummla z16.s, z1.b, z4.b\n"
-      ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
+      ".inst 0x45c59830  // ummla z16.s, z1.b, z5.b\n"
+      ".inst 0x45c49833  // ummla z19.s, z1.b, z4.b\n"
       "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
-      ".inst 0x45c49856  // ummla z22.s, z2.b, z4.b\n"
-      ".inst 0x45c59859  // ummla z25.s, z2.b, z5.b\n"
+      ".inst 0x45c59856  // ummla z22.s, z2.b, z5.b\n"
+      ".inst 0x45c49859  // ummla z25.s, z2.b, z4.b\n"
       "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
-      ".inst 0x45c4987c  // ummla z28.s, z3.b, z4.b\n"
-      ".inst 0x45c5987f  // ummla z31.s, z3.b, z5.b\n"
-      "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n"
-      "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n"
-      ".inst 0x45c69808  // ummla z8.s, z0.b, z6.b\n"
-      "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n"
+      ".inst 0x45c598dc  // ummla z28.s, z6.b, z5.b\n"
+      ".inst 0x45c498df  // ummla z31.s, z6.b, z4.b\n"
+      "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n"
+      "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n"
+      "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n"
+      ".inst 0x45c39808  // ummla z8.s, z0.b, z3.b\n"
+      "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n"
       "addvl x22, x22, #16\n"
       ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
+      ".inst 0x45c3982e  // ummla z14.s, z1.b, z3.b\n"
       ".inst 0x45c79831  // ummla z17.s, z1.b, z7.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      ".inst 0x45c79857  // ummla z23.s, z2.b, z7.b\n"
-      ".inst 0x45c6987a  // ummla z26.s, z3.b, z6.b\n"
-      ".inst 0x45c7987d  // ummla z29.s, z3.b, z7.b\n"
-      "ld1b { z6.b }, p0/Z, [x22, #-8, MUL VL]\n"
+      ".inst 0x45c398b4  // ummla z20.s, z5.b, z3.b\n"
+      ".inst 0x45c798b7  // ummla z23.s, z5.b, z7.b\n"
+      ".inst 0x45c398da  // ummla z26.s, z6.b, z3.b\n"
+      ".inst 0x45c798dd  // ummla z29.s, z6.b, z7.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n"
       "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n"
-      ".inst 0x45c49809  // ummla z9.s, z0.b, z4.b\n"
-      ".inst 0x45c5980c  // ummla z12.s, z0.b, z5.b\n"
-      ".inst 0x45c4982f  // ummla z15.s, z1.b, z4.b\n"
-      ".inst 0x45c59832  // ummla z18.s, z1.b, z5.b\n"
-      ".inst 0x45c49855  // ummla z21.s, z2.b, z4.b\n"
-      ".inst 0x45c59858  // ummla z24.s, z2.b, z5.b\n"
-      ".inst 0x45c4987b  // ummla z27.s, z3.b, z4.b\n"
-      ".inst 0x45c5987e  // ummla z30.s, z3.b, z5.b\n"
+      ".inst 0x45c29809  // ummla z9.s, z0.b, z2.b\n"
+      ".inst 0x45c4980c  // ummla z12.s, z0.b, z4.b\n"
+      ".inst 0x45c2982f  // ummla z15.s, z1.b, z2.b\n"
+      ".inst 0x45c49832  // ummla z18.s, z1.b, z4.b\n"
+      ".inst 0x45c298b5  // ummla z21.s, z5.b, z2.b\n"
+      ".inst 0x45c498b8  // ummla z24.s, z5.b, z4.b\n"
+      ".inst 0x45c298db  // ummla z27.s, z6.b, z2.b\n"
+      ".inst 0x45c498de  // ummla z30.s, z6.b, z4.b\n"
       "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n"
-      ".inst 0x45c6980a  // ummla z10.s, z0.b, z6.b\n"
+      ".inst 0x45c3980a  // ummla z10.s, z0.b, z3.b\n"
       ".inst 0x45c7980d  // ummla z13.s, z0.b, z7.b\n"
       "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
+      ".inst 0x45c39830  // ummla z16.s, z1.b, z3.b\n"
       ".inst 0x45c79833  // ummla z19.s, z1.b, z7.b\n"
       "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      ".inst 0x45c79859  // ummla z25.s, z2.b, z7.b\n"
+      ".inst 0x45c398b6  // ummla z22.s, z5.b, z3.b\n"
+      ".inst 0x45c798b9  // ummla z25.s, z5.b, z7.b\n"
       "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n"
-      ".inst 0x45c6987c  // ummla z28.s, z3.b, z6.b\n"
-      ".inst 0x45c7987f  // ummla z31.s, z3.b, z7.b\n"
+      ".inst 0x45c398dc  // ummla z28.s, z6.b, z3.b\n"
+      ".inst 0x45c798df  // ummla z31.s, z6.b, z7.b\n"
       "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
       "add %x[Apanel], %x[Apanel], #0x80\n"
       "addvl x22, x22, #-4\n"
       "bge 3b\n"
       "4:"  // main loop skip
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+      "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
       ".inst 0x45c49808  // ummla z8.s, z0.b, z4.b\n"
       ".inst 0x45c5980b  // ummla z11.s, z0.b, z5.b\n"
       ".inst 0x45c4982e  // ummla z14.s, z1.b, z4.b\n"
@@ -168,114 +172,114 @@
       "ld1b { z6.b }, p0/Z, [x22]\n"
       ".inst 0x45c49854  // ummla z20.s, z2.b, z4.b\n"
       ".inst 0x45c59857  // ummla z23.s, z2.b, z5.b\n"
-      "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x45c4987a  // ummla z26.s, z3.b, z4.b\n"
-      ".inst 0x45c5987d  // ummla z29.s, z3.b, z5.b\n"
-      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
-      "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
+      "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x45c498fa  // ummla z26.s, z7.b, z4.b\n"
+      ".inst 0x45c598fd  // ummla z29.s, z7.b, z5.b\n"
+      "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n"
+      "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n"
       ".inst 0x45c69809  // ummla z9.s, z0.b, z6.b\n"
-      ".inst 0x45c7980c  // ummla z12.s, z0.b, z7.b\n"
+      ".inst 0x45c3980c  // ummla z12.s, z0.b, z3.b\n"
       ".inst 0x45c6982f  // ummla z15.s, z1.b, z6.b\n"
-      ".inst 0x45c79832  // ummla z18.s, z1.b, z7.b\n"
+      ".inst 0x45c39832  // ummla z18.s, z1.b, z3.b\n"
       "add %x[Apanel], %x[Apanel], #0x10\n"
       ".inst 0x45c69855  // ummla z21.s, z2.b, z6.b\n"
-      ".inst 0x45c79858  // ummla z24.s, z2.b, z7.b\n"
+      ".inst 0x45c39858  // ummla z24.s, z2.b, z3.b\n"
       "addvl x22, x22, #4\n"
-      ".inst 0x45c6987b  // ummla z27.s, z3.b, z6.b\n"
-      ".inst 0x45c7987e  // ummla z30.s, z3.b, z7.b\n"
-      ".inst 0x45c4980a  // ummla z10.s, z0.b, z4.b\n"
-      ".inst 0x45c5980d  // ummla z13.s, z0.b, z5.b\n"
-      ".inst 0x45c49830  // ummla z16.s, z1.b, z4.b\n"
-      ".inst 0x45c59833  // ummla z19.s, z1.b, z5.b\n"
-      ".inst 0x45c49856  // ummla z22.s, z2.b, z4.b\n"
-      ".inst 0x45c59859  // ummla z25.s, z2.b, z5.b\n"
-      ".inst 0x45c4987c  // ummla z28.s, z3.b, z4.b\n"
-      ".inst 0x45c5987f  // ummla z31.s, z3.b, z5.b\n"
+      ".inst 0x45c698fb  // ummla z27.s, z7.b, z6.b\n"
+      ".inst 0x45c398fe  // ummla z30.s, z7.b, z3.b\n"
+      ".inst 0x45c5980a  // ummla z10.s, z0.b, z5.b\n"
+      ".inst 0x45c4980d  // ummla z13.s, z0.b, z4.b\n"
+      ".inst 0x45c59830  // ummla z16.s, z1.b, z5.b\n"
+      ".inst 0x45c49833  // ummla z19.s, z1.b, z4.b\n"
+      ".inst 0x45c59856  // ummla z22.s, z2.b, z5.b\n"
+      ".inst 0x45c49859  // ummla z25.s, z2.b, z4.b\n"
+      ".inst 0x45c598fc  // ummla z28.s, z7.b, z5.b\n"
+      ".inst 0x45c498ff  // ummla z31.s, z7.b, z4.b\n"
       "cbz x20, 5f\n"
-      "ld1b { z6.b }, p0/Z, [x22]\n"
-      "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
-      ".inst 0x45c69808  // ummla z8.s, z0.b, z6.b\n"
-      "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
-      "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n"
-      ".inst 0x45c7980b  // ummla z11.s, z0.b, z7.b\n"
-      "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
-      "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
-      ".inst 0x45c6982e  // ummla z14.s, z1.b, z6.b\n"
-      ".inst 0x45c79831  // ummla z17.s, z1.b, z7.b\n"
-      ".inst 0x45c69854  // ummla z20.s, z2.b, z6.b\n"
-      "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n"
-      ".inst 0x45c79857  // ummla z23.s, z2.b, z7.b\n"
-      ".inst 0x45c6987a  // ummla z26.s, z3.b, z6.b\n"
-      "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n"
-      ".inst 0x45c7987d  // ummla z29.s, z3.b, z7.b\n"
-      "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n"
-      "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n"
-      ".inst 0x45c49809  // ummla z9.s, z0.b, z4.b\n"
-      ".inst 0x45c5980c  // ummla z12.s, z0.b, z5.b\n"
+      "ld1b { z1.b }, p0/Z, [x22]\n"
+      "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n"
+      ".inst 0x45c198e8  // ummla z8.s, z7.b, z1.b\n"
+      "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n"
+      "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n"
+      ".inst 0x45c098eb  // ummla z11.s, z7.b, z0.b\n"
+      "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n"
+      "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n"
+      ".inst 0x45c198ce  // ummla z14.s, z6.b, z1.b\n"
+      ".inst 0x45c098d1  // ummla z17.s, z6.b, z0.b\n"
+      ".inst 0x45c198b4  // ummla z20.s, z5.b, z1.b\n"
+      "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n"
+      ".inst 0x45c098b7  // ummla z23.s, z5.b, z0.b\n"
+      ".inst 0x45c1989a  // ummla z26.s, z4.b, z1.b\n"
+      "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n"
+      ".inst 0x45c0989d  // ummla z29.s, z4.b, z0.b\n"
+      "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n"
+      "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n"
+      ".inst 0x45c398e9  // ummla z9.s, z7.b, z3.b\n"
+      ".inst 0x45c298ec  // ummla z12.s, z7.b, z2.b\n"
       "addvl x22, x22, #6\n"
-      ".inst 0x45c4982f  // ummla z15.s, z1.b, z4.b\n"
-      ".inst 0x45c59832  // ummla z18.s, z1.b, z5.b\n"
+      ".inst 0x45c398cf  // ummla z15.s, z6.b, z3.b\n"
+      ".inst 0x45c298d2  // ummla z18.s, z6.b, z2.b\n"
       "add %x[Apanel], %x[Apanel], #0x40\n"
-      ".inst 0x45c49855  // ummla z21.s, z2.b, z4.b\n"
-      ".inst 0x45c59858  // ummla z24.s, z2.b, z5.b\n"
-      ".inst 0x45c4987b  // ummla z27.s, z3.b, z4.b\n"
-      ".inst 0x45c5987e  // ummla z30.s, z3.b, z5.b\n"
-      ".inst 0x45c6980a  // ummla z10.s, z0.b, z6.b\n"
-      ".inst 0x45c7980d  // ummla z13.s, z0.b, z7.b\n"
-      ".inst 0x45c69830  // ummla z16.s, z1.b, z6.b\n"
-      ".inst 0x45c79833  // ummla z19.s, z1.b, z7.b\n"
-      ".inst 0x45c69856  // ummla z22.s, z2.b, z6.b\n"
-      ".inst 0x45c79859  // ummla z25.s, z2.b, z7.b\n"
-      ".inst 0x45c6987c  // ummla z28.s, z3.b, z6.b\n"
-      ".inst 0x45c7987f  // ummla z31.s, z3.b, z7.b\n"
+      ".inst 0x45c398b5  // ummla z21.s, z5.b, z3.b\n"
+      ".inst 0x45c298b8  // ummla z24.s, z5.b, z2.b\n"
+      ".inst 0x45c3989b  // ummla z27.s, z4.b, z3.b\n"
+      ".inst 0x45c2989e  // ummla z30.s, z4.b, z2.b\n"
+      ".inst 0x45c198ea  // ummla z10.s, z7.b, z1.b\n"
+      ".inst 0x45c098ed  // ummla z13.s, z7.b, z0.b\n"
+      ".inst 0x45c198d0  // ummla z16.s, z6.b, z1.b\n"
+      ".inst 0x45c098d3  // ummla z19.s, z6.b, z0.b\n"
+      ".inst 0x45c198b6  // ummla z22.s, z5.b, z1.b\n"
+      ".inst 0x45c098b9  // ummla z25.s, z5.b, z0.b\n"
+      ".inst 0x45c1989c  // ummla z28.s, z4.b, z1.b\n"
+      ".inst 0x45c0989f  // ummla z31.s, z4.b, z0.b\n"
       "5:"  // multiply loop done
-      "uzp1 z4.d, z8.d, z11.d\n"
+      "uzp1 z0.d, z8.d, z11.d\n"
       "uzp2 z8.d, z8.d, z11.d\n"
-      "st1w { z4.s }, p0, [%x[Cpanel]]\n"
-      "uzp1 z11.d, z9.d, z12.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel]]\n"
+      "uzp1 z0.d, z9.d, z12.d\n"
       "uzp2 z9.d, z9.d, z12.d\n"
-      "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "uzp1 z12.d, z10.d, z13.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+      "uzp1 z0.d, z10.d, z13.d\n"
       "uzp2 z10.d, z10.d, z13.d\n"
-      "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
       "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "uzp1 z13.d, z14.d, z17.d\n"
+      "uzp1 z0.d, z14.d, z17.d\n"
       "uzp2 z14.d, z14.d, z17.d\n"
       "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
-      "uzp1 z17.d, z15.d, z18.d\n"
+      "uzp1 z1.d, z15.d, z18.d\n"
       "subs x23, x23, #0x1\n"
       "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "uzp2 z15.d, z15.d, z18.d\n"
-      "uzp1 z18.d, z16.d, z19.d\n"
-      "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+      "uzp1 z17.d, z16.d, z19.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "uzp2 z16.d, z16.d, z19.d\n"
-      "uzp1 z19.d, z20.d, z23.d\n"
-      "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+      "uzp1 z0.d, z20.d, z23.d\n"
+      "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
       "addvl %x[Cpanel], %x[Cpanel], #16\n"
       "uzp2 z20.d, z20.d, z23.d\n"
-      "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
       "uzp1 z23.d, z21.d, z24.d\n"
       "uzp2 z21.d, z21.d, z24.d\n"
       "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
-      "uzp1 z24.d, z22.d, z25.d\n"
+      "uzp1 z19.d, z22.d, z25.d\n"
       "uzp2 z22.d, z22.d, z25.d\n"
       "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
-      "uzp1 z25.d, z26.d, z29.d\n"
+      "uzp1 z18.d, z26.d, z29.d\n"
       "uzp2 z26.d, z26.d, z29.d\n"
       "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
-      "uzp1 z29.d, z27.d, z30.d\n"
+      "uzp1 z17.d, z27.d, z30.d\n"
       "uzp2 z27.d, z27.d, z30.d\n"
-      "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
-      "uzp1 z30.d, z28.d, z31.d\n"
+      "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+      "uzp1 z16.d, z28.d, z31.d\n"
       "uzp2 z28.d, z28.d, z31.d\n"
       "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
-      "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+      "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
       "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
       "st1w { z21.s }, p0, [%x[Cpanel]]\n"
       "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
-      "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
-      "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
-      "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+      "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+      "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+      "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
       "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
       "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
       "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
@@ -290,4 +294,4 @@
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
index cf99bbd..87310d9 100644
--- a/src/core/NEON/kernels/arm_gemm/misc.cpp
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, 2022 Arm Limited.
+ * Copyright (c) 2017-2018, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,10 +56,14 @@
         wf_i |= 0x10;
     }
 
+#ifdef ARM_COMPUTE_ENABLE_SVE
     // Get total bytes in vector output
     if (kwf_i & 0x1) {
         vector_bytes = vector_count * get_vector_length<uint8_t>();
     } else {
+#else
+    if (1) {
+#endif
         vector_bytes = vector_count * 16;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index 3f34430..31dd65b 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
                       unsigned int multi, unsigned int first_col);
 
 template<typename T>
-void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
+void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
                        size_t M, int32_t *output_ptr, const Requantize32 *qp);
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
index 7345793..94cd7dd 100644
--- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
@@ -34,7 +34,7 @@
 
 template<>
 void row_sums_indirect(
-    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
     size_t M, int32_t *out_ptr, const Requantize32 *qp
 )
 {
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
index ff95507..2ab0397 100644
--- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
@@ -34,7 +34,7 @@
 
 template<>
 void row_sums_indirect(
-    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
     size_t M, int32_t *out_ptr, const Requantize32 *qp
 )
 {
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
index ae452e1..afe24e7 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,9 +67,8 @@
     }
 
     template<typename TOut>
-    void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool accumulate) {
+    void Merge(TOut *, const TResult *, int, int, int, int, int, const TOut *, const Activation, bool) {
         // Separate merge not supported for SME.
-        ARM_COMPUTE_UNUSED(out, in, stride, y0, ymax, x0, xmax, bias, act, accumulate);
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
index ef5a01a..5aa62f0 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -132,7 +132,9 @@
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int);
 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#ifdef ARM_COMPUTE_ENABLE_BF16
 template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int);
+#endif
 #endif // AArch32
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
index e618698..8574d89 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -193,7 +193,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -264,7 +263,6 @@
       "add %x[out], %x[out], #0x80\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -286,4 +284,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
index 6d97f71..cdf1f98 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -427,4 +427,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
index 96d132b..da0809d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -39,7 +39,6 @@
     size_t out_stride = 12 * roundup<size_t>(height, 8) * sizeof(uint8_t);
 
     __asm__ __volatile__(
-
       "1:"  // Main row loop: Head
       "mov x9, %x[in]\n"
       "add x28, x9, %x[in_stride]\n"
@@ -332,4 +331,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
index 04af6fd..cef468e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -236,7 +236,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x9, %x[in]\n"
       "mov x20, %x[width]\n"
@@ -319,7 +318,6 @@
       "add %x[out], %x[out], #0x30\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -341,4 +339,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
index e6ddc10..4c02d05 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -276,7 +276,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x9, %x[in]\n"
       "add x28, x9, %x[in_stride]\n"
@@ -420,7 +419,6 @@
       "add %x[out], %x[out], #0x60\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -442,4 +440,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
index e487d4d..2a3208d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -710,7 +710,6 @@
       "add %x[out], %x[out], #0x60\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -731,4 +730,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
index 7938325..4d9d5e7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -182,7 +182,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -251,7 +250,6 @@
       "add %x[out], %x[out], #0x18\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -272,4 +270,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
index 4c66fb2..b0cd7e4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -182,7 +182,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -251,7 +250,6 @@
       "add %x[out], %x[out], #0x18\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -272,4 +270,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
index f06c167..0399f8b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -137,4 +137,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
index e0ccb36..f3a1dde 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -327,4 +327,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
index fa45f4f..7c7e91e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -39,7 +39,6 @@
     size_t out_stride = 16 * roundup<size_t>(height, 8) * sizeof(uint8_t);
 
     __asm__ __volatile__(
-
       "1:"  // Main row loop: Head
       "mov x9, %x[in]\n"
       "add x28, x9, %x[in_stride]\n"
@@ -288,4 +287,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
index 06efa97..b4515cb 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -163,7 +163,6 @@
       "bge 1b\n"
       "cbz %x[height], 16f\n"
       "8:"  // Main loop skip
-
       "9:"  // Tail row loop: Head
       "mov x9, %x[in]\n"
       "mov x20, %x[width]\n"
@@ -221,7 +220,6 @@
       "add %x[out], %x[out], #0x40\n"
       "bge 9b\n"
       "16:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -243,4 +241,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
index dafa53e..ac67467 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -320,7 +320,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x9, %x[in]\n"
       "add x28, x9, %x[in_stride]\n"
@@ -486,7 +485,6 @@
       "add %x[out], %x[out], #0x80\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -508,4 +506,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
index e012d09..b9fe8b1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -281,7 +281,6 @@
       "bge 1b\n"
       "cbz %x[height], 16f\n"
       "8:"  // Main loop skip
-
       "9:"  // Tail row loop: Head
       "mov x9, %x[in]\n"
       "add x28, x9, %x[in_stride]\n"
@@ -423,7 +422,6 @@
       "add %x[out], %x[out], #0x80\n"
       "bge 9b\n"
       "16:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -444,4 +442,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
index 20f9d39..46211ad 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -158,7 +158,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -268,4 +267,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
index 22d68ac..1cb7bc4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -762,7 +762,6 @@
       "add %x[out], %x[out], #0xc0\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -783,4 +782,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
index 799a9cd..dcaf69d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -198,7 +198,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -271,7 +270,6 @@
       "add %x[out], %x[out], #0x30\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -292,4 +290,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
index 621c5f9..966b756 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -270,7 +270,6 @@
       "add %x[out], %x[out], #0x30\n"
       "bge 11b\n"
       "20:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -291,4 +290,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
index 5cd7bd0..4a22675 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -503,4 +503,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
index 706d7cd..2375366 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -280,7 +280,6 @@
       "bge 1b\n"
       "cbz %x[height], 24f\n"
       "12:"  // Main loop skip
-
       "13:"  // Tail row loop: Head
       "mov x25, %x[in]\n"
       "mov x20, %x[width]\n"
@@ -427,7 +426,6 @@
       "add %x[out], %x[out], #0x80\n"
       "bge 13b\n"
       "24:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -449,4 +447,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
index b482752..f35752d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -137,7 +137,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -241,4 +240,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
index e1ab14e..6ef02ac 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -39,7 +39,6 @@
     size_t out_stride = 4 * roundup<size_t>(height, 16) * sizeof(uint8_t);
 
     __asm__ __volatile__(
-
       "1:"  // Main row loop: Head
       "mov x17, %x[in]\n"
       "add x16, x17, %x[in_stride]\n"
@@ -316,4 +315,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
index 8adc69e..5667820 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -333,4 +333,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
index 07602bd..328274a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -145,7 +145,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -251,4 +250,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
index a048fbb..feb469a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 
 namespace {
 
@@ -177,7 +177,6 @@
       "bge 1b\n"
       "cbz %x[height], 20f\n"
       "10:"  // Main loop skip
-
       "11:"  // Tail row loop: Head
       "mov x20, %x[width]\n"
       "mov x25, %x[in]\n"
@@ -265,4 +264,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
index 01921c5..a4d480c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -140,4 +139,5 @@
     );
 }
 
-#endif // __ARM_FEATURE_SVE
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
index 6b9b471..552abfc 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -178,4 +177,5 @@
     );
 }
 
-#endif // __ARM_FEATURE_SVE
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
index 96128cf..9c6f5c8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -150,4 +149,5 @@
     );
 }
 
-#endif // __ARM_FEATURE_SVE
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
index 080db1c..2756327 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -197,4 +196,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
index 7e49609..a6ddb8f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -210,4 +209,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
index 45d3c07..399a52e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -146,4 +145,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
index 7120d1d..6318e29 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -206,4 +205,4 @@
     );
 }
 
-#endif
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
index 72e7b0c..b900630 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -219,4 +218,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
index a057fd5..f827197 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -222,4 +221,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
index 9eb4075..c471d66 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -146,4 +145,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
index 3fc3920..5f967fa 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -208,4 +207,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
index 9d402a2..f22b833 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -236,4 +235,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
index 362bebb..14636e3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -185,4 +184,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
index cbcc0b4..2d46a48 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -121,4 +120,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
index 9b28578..002a124 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -168,4 +167,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
index 8873070..2a43f34 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
 
 namespace {
 
@@ -186,4 +185,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
new file mode 100644
index 0000000..be9ad66
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+    size_t out_stride = 8 * height * sme::get_vector_length<uint8_t>();
+
+    __asm__ __volatile__(
+      ".inst 0xd503477f  // SMSTART ZA\n"
+      "cmp %x[height], #0x2\n"
+      "ptrue p7.b\n"
+      "blt 4f\n"
+      "1:"  // Main row loop: Head
+      "mov x25, %x[in]\n"
+      "add x24, x25, %x[in_stride]\n"
+      "add %x[in], x24, %x[in_stride]\n"
+      "mov x23, %x[out]\n"
+      "sub %x[height], %x[height], #0x2\n"
+      "mov x22, %x[width]\n"
+      "2:"  // Main row loop: Column loop
+      "mov x21, x22\n"
+      "whilelt p0.h, XZR, x21\n"
+      "ld1h { z31.h }, p0/Z, [x25]\n"
+      "dech x21\n"
+      "whilelt p6.h, XZR, x21\n"
+      "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n"
+      "dech x21\n"
+      "whilelt p5.h, XZR, x21\n"
+      "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n"
+      "dech x21\n"
+      "whilelt p4.h, XZR, x21\n"
+      "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n"
+      "dech x21\n"
+      "whilelt p3.h, XZR, x21\n"
+      "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n"
+      "dech x21\n"
+      "whilelt p2.h, XZR, x21\n"
+      "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n"
+      "dech x21\n"
+      "whilelt p1.h, XZR, x21\n"
+      "ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n"
+      "dech x21\n"
+      "mov x20, x23\n"
+      "ld1h { z24.h }, p0/Z, [x24]\n"
+      "whilelt p0.h, XZR, x21\n"
+      "dech x22, ALL, MUL #8\n"
+      "ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n"
+      "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n"
+      "cmp x22, #0x0\n"
+      "addvl x25, x25, #8\n"
+      "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n"
+      "add x23, x23, %x[out_stride]\n"
+      "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n"
+      "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n"
+      "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n"
+      "st1h { z31.h }, p7, [x20]\n"
+      "addvl x24, x24, #8\n"
+      "st1h { z30.h }, p7, [x20, #1, MUL VL]\n"
+      "st1h { z29.h }, p7, [x20, #2, MUL VL]\n"
+      "st1h { z28.h }, p7, [x20, #3, MUL VL]\n"
+      "st1h { z27.h }, p7, [x20, #4, MUL VL]\n"
+      "st1h { z26.h }, p7, [x20, #5, MUL VL]\n"
+      "st1h { z25.h }, p7, [x20, #6, MUL VL]\n"
+      "st1h { z23.h }, p7, [x20, #7, MUL VL]\n"
+      "addvl x20, x20, #16\n"
+      "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n"
+      "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n"
+      "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n"
+      "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n"
+      "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n"
+      "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n"
+      "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n"
+      "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n"
+      "bgt 2b\n"
+      "3:"  // Main row loop: Column loop skip
+      "cmp %x[height], #0x2\n"
+      "addvl %x[out], %x[out], #16\n"
+      "bge 1b\n"
+      "cbz %x[height], 8f\n"
+      "4:"  // Main loop skip
+      "5:"  // Tail row loop: Head
+      "mov x25, %x[in]\n"
+      "add %x[in], x25, %x[in_stride]\n"
+      "mov x23, %x[out]\n"
+      "sub %x[height], %x[height], #0x1\n"
+      "mov x21, %x[width]\n"
+      "6:"  // Tail row loop: Column loop
+      "mov x20, x21\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z23.h }, p0/Z, [x25]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z17.h }, p0/Z, [x25, #6, MUL VL]\n"
+      "dech x20\n"
+      "dech x21, ALL, MUL #8\n"
+      "whilelt p0.h, XZR, x20\n"
+      "cmp x21, #0x0\n"
+      "ld1h { z16.h }, p0/Z, [x25, #7, MUL VL]\n"
+      "st1h { z23.h }, p7, [x23]\n"
+      "addvl x25, x25, #8\n"
+      "st1h { z22.h }, p7, [x23, #1, MUL VL]\n"
+      "st1h { z21.h }, p7, [x23, #2, MUL VL]\n"
+      "st1h { z20.h }, p7, [x23, #3, MUL VL]\n"
+      "st1h { z19.h }, p7, [x23, #4, MUL VL]\n"
+      "st1h { z18.h }, p7, [x23, #5, MUL VL]\n"
+      "st1h { z17.h }, p7, [x23, #6, MUL VL]\n"
+      "st1h { z16.h }, p7, [x23, #7, MUL VL]\n"
+      "add x23, x23, %x[out_stride]\n"
+      "bgt 6b\n"
+      "7:"  // Tail row loop: Column loop skip
+      "cmp %x[height], #0x1\n"
+      "addvl %x[out], %x[out], #8\n"
+      "bge 5b\n"
+      "8:"  // Done
+      ".inst 0xd503467f  // SMSTOP\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+    float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(float) / 2,
+        stride * sizeof(float),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+    bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(bfloat16) / 2,
+        stride * sizeof(bfloat16),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+    __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(__fp16) / 2,
+        stride * sizeof(__fp16),
+        (kmax-k0)
+    );
+}
+
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
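
Note (illustrative, not part of the patch): the three Transform<8, 1, true, VLType::SME> specializations above all funnel into the same sme_transpose_interleave_8VL() routine by reinterpreting their element type as raw uint16_t lanes, scaling the width accordingly (one float row contributes two halfword columns). Below is a minimal scalar sketch of the panel layout the kernel appears to produce; the helper name and the explicit per-vector lane count vl are assumptions for illustration, and the strides here are in elements whereas the kernel above takes its input stride in bytes.

#include <cstddef>
#include <cstdint>

// Scalar reference for an "8VL" transpose-interleave of 16-bit data (sketch only).
static void transpose_interleave_8vl_ref(uint16_t *out, const uint16_t *in,
                                         size_t width, size_t in_stride_elems,
                                         size_t height, size_t vl /* 16-bit lanes per vector */)
{
    const size_t block = 8 * vl;               // one output panel spans 8 vectors of columns
    const size_t panel_stride = block * height;
    for (size_t col = 0; col < width; col += block) {
        for (size_t row = 0; row < height; row++) {
            const uint16_t *src = in + row * in_stride_elems + col;
            uint16_t *dst = out + (col / block) * panel_stride + row * block;
            for (size_t i = 0; i < block; i++) {
                dst[i] = (col + i < width) ? src[i] : 0;  // predicated loads zero the ragged tail
            }
        }
    }
}
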
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000..45d2e24
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+    uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+    if (height % 4) {
+        memset(pad_row, 0, width * sizeof(uint8_t));
+    }
+
+    size_t out_stride = 8 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+    __asm__ __volatile__(
+      ".inst 0xd503477f  // SMSTART ZA\n"
+      "ptrue p2.b\n"
+      "1:"  // Main row loop: Head
+      "mov x26, %x[in]\n"
+      "add x25, x26, %x[in_stride]\n"
+      "add x24, x25, %x[in_stride]\n"
+      "add x23, x24, %x[in_stride]\n"
+      "cmp %x[height], #0x3\n"
+      "add %x[in], x23, %x[in_stride]\n"
+      "csel x23, x23, %x[pad_row], GT\n"
+      "csel x24, x24, %x[pad_row], GE\n"
+      "cmp %x[height], #0x1\n"
+      "mov x22, %x[out]\n"
+      "csel x25, x25, %x[pad_row], GT\n"
+      "sub %x[height], %x[height], #0x4\n"
+      "mov x21, %x[width]\n"
+      "2:"  // Main row loop: Column loop
+      "mov x20, x21\n"
+      "whilelt p1.b, XZR, x20\n"
+      "ld1b { z19.b }, p1/Z, [x26]\n"
+      "decb x20\n"
+      "whilelt p0.b, XZR, x20\n"
+      "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1b { z18.b }, p1/Z, [x25]\n"
+      "decw x21, ALL, MUL #8\n"
+      "cmp x21, #0x0\n"
+      "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n"
+      "addvl x26, x26, #2\n"
+      "addvl x25, x25, #2\n"
+      "ld1b { z16.b }, p1/Z, [x24]\n"
+      "zip1 z24.b, z19.b, z16.b\n"
+      "zip2 z20.b, z19.b, z16.b\n"
+      "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n"
+      "zip1 z23.b, z17.b, z16.b\n"
+      "zip2 z22.b, z17.b, z16.b\n"
+      "addvl x24, x24, #2\n"
+      "ld1b { z16.b }, p1/Z, [x23]\n"
+      "zip1 z17.b, z18.b, z16.b\n"
+      "zip2 z19.b, z18.b, z16.b\n"
+      "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n"
+      "zip1 z18.b, z21.b, z16.b\n"
+      "zip2 z21.b, z21.b, z16.b\n"
+      "addvl x23, x23, #2\n"
+      "zip1 z16.b, z24.b, z17.b\n"
+      "zip2 z17.b, z24.b, z17.b\n"
+      "st1b { z16.b }, p2, [x22]\n"
+      "zip1 z16.b, z20.b, z19.b\n"
+      "zip2 z20.b, z20.b, z19.b\n"
+      "st1b { z17.b }, p2, [x22, #1, MUL VL]\n"
+      "zip1 z19.b, z23.b, z18.b\n"
+      "zip2 z18.b, z23.b, z18.b\n"
+      "st1b { z16.b }, p2, [x22, #2, MUL VL]\n"
+      "zip1 z17.b, z22.b, z21.b\n"
+      "zip2 z16.b, z22.b, z21.b\n"
+      "st1b { z20.b }, p2, [x22, #3, MUL VL]\n"
+      "st1b { z19.b }, p2, [x22, #4, MUL VL]\n"
+      "st1b { z18.b }, p2, [x22, #5, MUL VL]\n"
+      "st1b { z17.b }, p2, [x22, #6, MUL VL]\n"
+      "st1b { z16.b }, p2, [x22, #7, MUL VL]\n"
+      "add x22, x22, %x[out_stride]\n"
+      "bgt 2b\n"
+      "3:"  // Main row loop: Column loop skip
+      "cmp %x[height], #0x1\n"
+      "addvl %x[out], %x[out], #8\n"
+      "bge 1b\n"
+      ".inst 0xd503467f  // SMSTOP\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+    uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_1x4(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(uint8_t) / 1,
+        stride * sizeof(uint8_t),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+    int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_1x4(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(int8_t) / 1,
+        stride * sizeof(int8_t),
+        (kmax-k0)
+    );
+}
+
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
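
Note (illustrative, not part of the patch): sme_transpose_interleave_8VL_1x4() above zips four consecutive input rows byte-wise (zip1/zip2), so within each column panel every group of four output bytes comes from rows r..r+3 of the same column; heights that are not a multiple of four read the missing rows from the zeroed pad_row. A scalar sketch of that within-panel 1x4 ordering, with a hypothetical helper name:

#include <cstddef>
#include <cstdint>

// Scalar view of the 1x4 block layout within one column panel (sketch only).
static void interleave_1x4_ref(uint8_t *out, const uint8_t *rows[4], size_t width)
{
    // rows[1..3] may point at a zero-filled pad row when height % 4 != 0
    for (size_t x = 0; x < width; x++) {
        for (int r = 0; r < 4; r++) {
            out[x * 4 + r] = rows[r][x];
        }
    }
}
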
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000..ec7c415
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+    uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+    if (height % 2) {
+        memset(pad_row, 0, width * sizeof(uint16_t));
+    }
+
+    size_t out_stride = 8 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+    __asm__ __volatile__(
+      ".inst 0xd503477f  // SMSTART ZA\n"
+      "ptrue p4.b\n"
+      "1:"  // Main row loop: Head
+      "mov x24, %x[in]\n"
+      "add x23, x24, %x[in_stride]\n"
+      "cmp %x[height], #0x1\n"
+      "add %x[in], x23, %x[in_stride]\n"
+      "mov x22, %x[out]\n"
+      "csel x23, x23, %x[pad_row], GT\n"
+      "sub %x[height], %x[height], #0x2\n"
+      "mov x21, %x[width]\n"
+      "2:"  // Main row loop: Column loop
+      "mov x20, x21\n"
+      "whilelt p3.h, XZR, x20\n"
+      "ld1h { z20.h }, p3/Z, [x24]\n"
+      "dech x20\n"
+      "whilelt p2.h, XZR, x20\n"
+      "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p1.h, XZR, x20\n"
+      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z17.h }, p3/Z, [x23]\n"
+      "decw x21, ALL, MUL #8\n"
+      "cmp x21, #0x0\n"
+      "zip1 z23.h, z20.h, z17.h\n"
+      "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
+      "addvl x24, x24, #4\n"
+      "zip2 z22.h, z20.h, z17.h\n"
+      "zip1 z21.h, z19.h, z16.h\n"
+      "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+      "zip2 z20.h, z19.h, z16.h\n"
+      "zip1 z19.h, z18.h, z17.h\n"
+      "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "zip2 z18.h, z18.h, z17.h\n"
+      "zip1 z17.h, z24.h, z16.h\n"
+      "zip2 z16.h, z24.h, z16.h\n"
+      "st1h { z23.h }, p4, [x22]\n"
+      "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+      "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+      "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+      "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+      "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+      "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+      "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+      "add x22, x22, %x[out_stride]\n"
+      "bgt 2b\n"
+      "3:"  // Main row loop: Column loop skip
+      "cmp %x[height], #0x1\n"
+      "addvl %x[out], %x[out], #8\n"
+      "bge 1b\n"
+      ".inst 0xd503467f  // SMSTOP\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+    bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_2x2(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(bfloat16) / 2,
+        stride * sizeof(bfloat16),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+    __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_2x2(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(__fp16) / 2,
+        stride * sizeof(__fp16),
+        (kmax-k0)
+    );
+}
+
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SME)
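
Note (illustrative, not part of the patch): the 2x2 variant does the same for 16-bit types but pairs rows instead of grouping four: within each column panel, consecutive halfwords alternate between rows r and r+1, and an odd height reads the second row from the zeroed pad_row. A scalar sketch of that pairing, with a hypothetical helper name:

#include <cstddef>
#include <cstdint>

// Scalar view of the 2x2 block layout within one column panel (sketch only).
static void interleave_2x2_ref(uint16_t *out, const uint16_t *row0,
                               const uint16_t *row1, size_t width)
{
    for (size_t x = 0; x < width; x++) {
        out[x * 2 + 0] = row0[x];   // row1 may be the zeroed pad row when height is odd
        out[x * 2 + 1] = row1[x];
    }
}
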
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
index 8477189..f627fe5 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -373,4 +372,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
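
Note on the guard change (my reading, not stated in the patch): __ARM_FEATURE_SVE is a compiler feature-test macro, defined only when the baseline compile target already includes SVE, whereas ARM_COMPUTE_ENABLE_SVE appears to be a build-system option, so guarding on it lets these transforms be compiled into a multi-ISA binary and selected at run time by CPU feature detection. Schematically:

#if defined(ARM_COMPUTE_ENABLE_SVE)
// Build option: "ship SVE kernels"; callers still check CPU features at run time.
#endif

#ifdef __ARM_FEATURE_SVE
// Compiler macro: only defined when the compile target itself guarantees SVE.
#endif

The same reasoning applies to the remaining __ARM_FEATURE_SVE -> ARM_COMPUTE_ENABLE_SVE hunks below.
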
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
index 74fce4d..b33c4f6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -101,7 +100,6 @@
       "bge 1b\n"
       "cbz %x[height], 12f\n"
       "6:"  // Main loop skip
-
       "7:"  // Tail row loop: Head
       "mov x21, %x[width]\n"
       "cntw x20, ALL, MUL #2\n"
@@ -138,7 +136,6 @@
       "addvl %x[out], %x[out], #1\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23"
@@ -160,4 +157,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
index a034be5..e468787 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -305,4 +304,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
index 82d4184..546800f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -93,7 +92,6 @@
       "bge 1b\n"
       "cbz %x[height], 8f\n"
       "4:"  // Main loop skip
-
       "5:"  // Tail row loop: Head
       "mov x26, %x[in]\n"
       "add %x[in], x26, %x[in_stride]\n"
@@ -123,7 +121,6 @@
       "addvl %x[out], %x[out], #3\n"
       "bge 5b\n"
       "8:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
@@ -171,4 +168,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
index ec7095d..a44141c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -329,7 +328,6 @@
       "addvl %x[out], %x[out], #3\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -364,4 +362,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
index 3d14383..36a15a1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -292,7 +291,6 @@
       "addvl %x[out], %x[out], #3\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -314,4 +312,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
index a392351..e661e26 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -103,7 +102,6 @@
       "bge 1b\n"
       "cbz %x[height], 8f\n"
       "4:"  // Main loop skip
-
       "5:"  // Tail row loop: Head
       "mov x26, %x[in]\n"
       "add %x[in], x26, %x[in_stride]\n"
@@ -137,7 +135,6 @@
       "addvl %x[out], %x[out], #4\n"
       "bge 5b\n"
       "8:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -185,4 +182,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
index e348939..03a78f7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -317,4 +316,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
index 9505dc5..b196799 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -248,7 +247,6 @@
       "bge 1b\n"
       "cbz %x[height], 12f\n"
       "6:"  // Main loop skip
-
       "7:"  // Tail row loop: Head
       "mov x12, %x[in]\n"
       "mov x21, %x[width]\n"
@@ -323,7 +321,6 @@
       "addvl %x[out], %x[out], #4\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -345,4 +342,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
index 982c054..68fe2d0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -292,4 +291,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
index 2b5741a..910fc6c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -260,7 +259,6 @@
       "bge 1b\n"
       "cbz %x[height], 12f\n"
       "6:"  // Main loop skip
-
       "7:"  // Tail row loop: Head
       "mov x12, %x[in]\n"
       "add x11, x12, %x[in_stride]\n"
@@ -386,7 +384,6 @@
       "addvl %x[out], %x[out], #6\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -408,4 +405,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
index 146da33..f0f10d2 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -235,4 +234,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
index f6fc5e8..c638eaa 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -296,7 +295,6 @@
       "addvl %x[out], %x[out], #6\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -318,4 +316,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
index 07147ac..0526bd0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -281,7 +280,6 @@
       "addvl %x[out], %x[out], #8\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -303,4 +301,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
index 3ba50fe..98f0770 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -283,4 +282,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
index 6b5ca38..3fa5292 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -256,4 +255,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
index 237e9b6..02977ec 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -354,7 +353,6 @@
       "addvl %x[out], %x[out], #8\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -376,4 +374,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
index 51cae7d..34799c6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -439,7 +438,6 @@
       "addvl %x[out], %x[out], #8\n"
       "bge 7b\n"
       "12:"  // Done
-
       : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
       : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -461,4 +459,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
index 4ad8828..5a48e57 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace {
 
@@ -279,4 +278,5 @@
     );
 }
 
-#endif
+
+#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index a28ddad..11b1bd3 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,7 +80,8 @@
 enum class VLType {
     None,
     SVE,
-    SME
+    SME,
+    SME2
 };
 
 template<typename T>